blob: 8a1f666014a0de1799f70f6c3606547e27c00f86 [file] [log] [blame]
XNNPACK Teamb455b122019-09-27 18:10:33 -07001// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
Marat Dukhan01849012020-04-27 19:28:32 -07009#include <math.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070010#include <stdbool.h>
11#include <stddef.h>
12#include <stdint.h>
Marat Dukhan04f03be2019-11-19 12:36:47 -080013#include <string.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070014
Marat Dukhan57133c02020-04-13 00:54:59 -070015#ifdef _WIN32
16 #include <windows.h>
17#else
18 #include <pthread.h>
19#endif
XNNPACK Teamb455b122019-09-27 18:10:33 -070020
Marat Dukhan496389f2021-04-07 15:47:12 -070021#ifdef _MSC_VER
22 #include <intrin.h>
23#endif
24
Marat Dukhand343c222019-10-07 09:22:14 -070025#ifndef __EMSCRIPTEN__
26 #include <cpuinfo.h>
27#endif
XNNPACK Teamb455b122019-09-27 18:10:33 -070028
29#include <xnnpack.h>
Marat Dukhan496389f2021-04-07 15:47:12 -070030#include <xnnpack/allocator.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070031#include <xnnpack/argmaxpool.h>
32#include <xnnpack/avgpool.h>
Marat Dukhan1dadbf72019-10-01 10:46:20 -070033#include <xnnpack/common.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070034#include <xnnpack/conv.h>
35#include <xnnpack/dwconv.h>
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -080036#include <xnnpack/depthtospace.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070037#include <xnnpack/gavgpool.h>
38#include <xnnpack/gemm.h>
Marat Dukhan4662b192020-05-21 15:52:03 -070039#include <xnnpack/fill.h>
Marat Dukhan660fd192020-03-10 04:55:30 -070040#include <xnnpack/ibilinear.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070041#include <xnnpack/igemm.h>
42#include <xnnpack/log.h>
43#include <xnnpack/lut.h>
44#include <xnnpack/maxpool.h>
45#include <xnnpack/pad.h>
46#include <xnnpack/params.h>
Marat Dukhanc5a7a392021-05-21 16:04:31 -070047#include <xnnpack/params-init.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070048#include <xnnpack/pavgpool.h>
49#include <xnnpack/prelu.h>
Marat Dukhan1edc4542020-01-27 12:40:13 -080050#include <xnnpack/raddstoreexpminusmax.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070051#include <xnnpack/rmax.h>
52#include <xnnpack/spmm.h>
53#include <xnnpack/unpool.h>
Marat Dukhan64287252021-09-07 16:20:03 -070054#include <xnnpack/vaddsub.h>
Marat Dukhan1e782c42019-11-21 17:02:40 -080055#include <xnnpack/vbinary.h>
Marat Dukhanaf2ba002021-10-24 14:21:41 -070056#include <xnnpack/vcvt.h>
Marat Dukhan0853b8a2021-08-03 01:01:53 -070057#include <xnnpack/vmul.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070058#include <xnnpack/vmulcaddc.h>
Marat Dukhan1e782c42019-11-21 17:02:40 -080059#include <xnnpack/vunary.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070060#include <xnnpack/zip.h>
61
// Build-time switch, defaulted to 1 when the build has not defined it.
// init() below tests it with `#if XNN_ENABLE_ASSEMBLY` to choose which set of
// micro-kernels to register: the enabled branch references the *__aarch32_*
// kernels, the `#else` branch references kernels without that suffix.
62#ifndef XNN_ENABLE_ASSEMBLY
63 #define XNN_ENABLE_ASSEMBLY 1
64#endif
65
// File-local one-time-initialization guard: an INIT_ONCE object on Windows,
// a pthread_once_t everywhere else, each set to its static initializer.
// Its consumer is not in the visible part of this file; presumably it is
// handed to InitOnceExecuteOnce() / pthread_once() so init() runs once.
// TODO confirm against the public initialization entry point.
// NOTE(review): the matching header is selected with `#ifdef _WIN32` near the
// top of the file, while this declaration tests XNN_PLATFORM_WINDOWS; verify
// the two conditions always agree.
Zhi An Ng0db15d32021-12-10 16:45:06 -080066#if XNN_PLATFORM_WINDOWS
Marat Dukhan57133c02020-04-13 00:54:59 -070067 static INIT_ONCE init_guard = INIT_ONCE_STATIC_INIT;
68#else
69 static pthread_once_t init_guard = PTHREAD_ONCE_INIT;
70#endif
XNNPACK Teamb455b122019-09-27 18:10:33 -070071
// File-local volatile pointer to a const xnn_allocator, NULL until assigned.
// Nothing in the visible part of this file reads or writes it; presumably it
// records the allocator chosen at initialization time. TODO confirm.
// NOTE(review): `volatile` alone gives neither atomicity nor ordering between
// threads; check that every writer uses an atomic/interlocked operation.
Marat Dukhan496389f2021-04-07 15:47:12 -070072static const struct xnn_allocator* volatile init_allocator = NULL;
73
// Global parameter table with external linkage. init() below fills in
// per-datatype entries such as xnn_params.qc8.gemm and xnn_params.qs8.gemm.
// Only init_flags is initialized explicitly; as a static-storage object every
// other member starts zeroed. init() depends on that: for example it reads
// gemm.log2_kr when selecting little-core kernels, although several big-core
// branches never assign it.
XNNPACK Teamb455b122019-09-27 18:10:33 -070074struct xnn_parameters xnn_params = {
// No flags set at load time. init() accumulates flags in a local variable;
// presumably they are stored here once initialization succeeds. TODO confirm.
Marat Dukhan854fb6b2020-06-19 12:33:44 -070075 .init_flags = 0
XNNPACK Teamb455b122019-09-27 18:10:33 -070076};
77
Marat Dukhan01849012020-04-27 19:28:32 -070078static void init(void) {
Marat Dukhan4c617792021-12-21 15:47:58 -080079#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhan01849012020-04-27 19:28:32 -070080 // Unlike most other architectures, on x86/x86-64 when floating-point instructions
81 // have no NaN arguments, but produce NaN output, the output NaN has sign bit set.
82 // We use it to distinguish x86/x86-64 from other architectures, by doing subtraction
83 // of two infinities (must produce NaN per IEEE 754 standard).
84 static const volatile float inf = INFINITY;
85 const bool is_wasm_x86 = signbit(inf - inf);
XNNPACK Teamb455b122019-09-27 18:10:33 -070086#endif
Marat Dukhan854fb6b2020-06-19 12:33:44 -070087 uint32_t init_flags = XNN_INIT_FLAG_XNNPACK;
XNNPACK Teamb455b122019-09-27 18:10:33 -070088
Marat Dukhan1dadbf72019-10-01 10:46:20 -070089#if XNN_ARCH_ARM
Frank Barchardbcdb1c12020-05-11 14:13:20 -070090 #if XNN_PLATFORM_MOBILE
Marat Dukhan3b745a42020-05-10 21:43:25 -070091 if (!cpuinfo_has_arm_neon()) {
92 xnn_log_error("XNNPACK initialization failed: NEON is not supported");
93 return;
94 }
95 #else
96 if (!cpuinfo_has_arm_vfpv2() && !cpuinfo_has_arm_vfpv3()) {
97 xnn_log_error("XNNPACK initialization failed: VFP is not supported");
98 return;
99 }
100 #endif
XNNPACK Teamb455b122019-09-27 18:10:33 -0700101
Marat Dukhan3b745a42020-05-10 21:43:25 -0700102 if (cpuinfo_has_arm_neon()) {
Frank Barchardb40ee632021-12-30 11:10:02 -0800103 /**************************** QC8 AArch32 micro-kernels ****************************/
Marat Dukhan898d5852021-06-30 21:18:34 -0700104 #ifndef XNN_NO_QC8_OPERATORS
105 init_flags |= XNN_INIT_FLAG_QC8;
106
Frank Barchardf290a142022-01-05 01:08:37 -0800107 #if XNN_ENABLE_ASSEMBLY
108 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
Frank Barchardba5091f2022-01-25 13:31:26 -0800109 switch (cpuinfo_get_uarch(0)->uarch) {
110 case cpuinfo_uarch_cortex_a55:
111 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55);
Frank Barchard6cc5b482022-01-26 17:01:41 -0800112 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55);
Frank Barchardba5091f2022-01-25 13:31:26 -0800113 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
114 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
115 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
116 xnn_params.qc8.gemm.mr = 4;
117 xnn_params.qc8.gemm.nr = 8;
118 xnn_params.qc8.gemm.log2_kr = 2;
119 break;
120 default:
121 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64);
122 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64);
123 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
124 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
125 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
126 xnn_params.qc8.gemm.mr = 4;
127 xnn_params.qc8.gemm.nr = 8;
128 xnn_params.qc8.gemm.log2_kr = 2;
129 break;
130 }
Frank Barchardf290a142022-01-05 01:08:37 -0800131 } else {
132 switch (cpuinfo_get_uarch(0)->uarch) {
Frank Barchard101271e2022-02-02 01:49:54 -0800133 case cpuinfo_uarch_cortex_a7:
Frank Barchard2991acf2022-02-02 20:34:57 -0800134 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
Frank Barchard101271e2022-02-02 01:49:54 -0800135 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
136 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
137 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
138 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
139 xnn_params.qc8.gemm.mr = 4;
140 xnn_params.qc8.gemm.nr = 8;
141 break;
Frank Barchard2991acf2022-02-02 20:34:57 -0800142 case cpuinfo_uarch_cortex_a35:
143 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
144 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
145 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
146 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
147 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
148 xnn_params.qc8.gemm.mr = 4;
149 xnn_params.qc8.gemm.nr = 8;
150 break;
Frank Barchardf290a142022-01-05 01:08:37 -0800151 case cpuinfo_uarch_cortex_a53:
Frank Barchard0455acf2022-02-02 00:51:40 -0800152 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53);
153 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
154 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
155 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
156 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
157 xnn_params.qc8.gemm.mr = 4;
158 xnn_params.qc8.gemm.nr = 8;
159 break;
Frank Barchard101271e2022-02-02 01:49:54 -0800160 case cpuinfo_uarch_cortex_a55r0:
161 case cpuinfo_uarch_kryo:
162 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53);
163 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
164 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
165 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
166 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
167 xnn_params.qc8.gemm.mr = 4;
168 xnn_params.qc8.gemm.nr = 8;
169 break;
Frank Barchardf290a142022-01-05 01:08:37 -0800170 case cpuinfo_uarch_cortex_a72:
171 case cpuinfo_uarch_exynos_m1:
172 case cpuinfo_uarch_exynos_m2:
173 case cpuinfo_uarch_exynos_m3:
Frank Barchardf290a142022-01-05 01:08:37 -0800174 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
175 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
176 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
177 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
178 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
179 xnn_params.qc8.gemm.mr = 4;
180 xnn_params.qc8.gemm.nr = 8;
181 break;
182
183 default:
184 if (cpuinfo_has_arm_neon_v8()) {
185 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
186 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
187 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
188 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
189 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
190 xnn_params.qc8.gemm.mr = 4;
191 xnn_params.qc8.gemm.nr = 8;
192 } else {
Frank Barchardd2e8d4d2022-01-14 17:18:53 -0800193 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
194 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
195 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
196 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
Frank Barchardf290a142022-01-05 01:08:37 -0800197 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
Frank Barchardd2e8d4d2022-01-14 17:18:53 -0800198 xnn_params.qc8.gemm.mr = 4;
Frank Barchardf290a142022-01-05 01:08:37 -0800199 xnn_params.qc8.gemm.nr = 8;
Frank Barchardf290a142022-01-05 01:08:37 -0800200 }
201 break;
202 }
203 }
Frank Barchardba5091f2022-01-25 13:31:26 -0800204 #if XNN_MAX_UARCH_TYPES > 1
205 {
206 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
207 const uint32_t mr = xnn_params.qc8.gemm.mr;
208 const uint32_t nr = xnn_params.qc8.gemm.nr;
209 const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
210 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
211 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
212 if (uarch_info == NULL) {
213 /* No more microarchitectures in the system */
214 break;
215 }
216
217 switch (uarch_info->uarch) {
218 case cpuinfo_uarch_cortex_a55:
219 if (mr == 4 && nr == 8 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
220 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55;
Frank Barchard6cc5b482022-01-26 17:01:41 -0800221 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55;
Frank Barchardba5091f2022-01-25 13:31:26 -0800222 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot;
223 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot;
224 }
225 break;
226 case cpuinfo_uarch_cortex_a53:
Frank Barchardba5091f2022-01-25 13:31:26 -0800227 if (mr == 4 && nr == 8 && log2_kr == 0) {
Frank Barchard0455acf2022-02-02 00:51:40 -0800228 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53;
Frank Barchardba5091f2022-01-25 13:31:26 -0800229 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64;
230 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
231 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
232 }
233 break;
Frank Barchard101271e2022-02-02 01:49:54 -0800234 case cpuinfo_uarch_cortex_a55r0:
235 if (mr == 4 && nr == 8 && log2_kr == 0) {
236 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53;
237 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64;
238 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
239 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
240 }
241 break;
242
Frank Barchardba5091f2022-01-25 13:31:26 -0800243 default:
244 break;
245 }
246 }
247 }
248 #endif // XNN_MAX_UARCH_TYPES > 1
Frank Barchardf290a142022-01-05 01:08:37 -0800249 #else // XNN_ENABLE_ASSEMBLY
250 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
Frank Barchard70137e42021-12-28 15:40:18 -0800251 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot);
Frank Barchard70137e42021-12-28 15:40:18 -0800252 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__neondot);
Frank Barchardf290a142022-01-05 01:08:37 -0800253 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
Frank Barchard70137e42021-12-28 15:40:18 -0800254 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
255 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
256 xnn_params.qc8.gemm.mr = 4;
257 xnn_params.qc8.gemm.nr = 8;
258 xnn_params.qc8.gemm.log2_kr = 2;
Frank Barchardf290a142022-01-05 01:08:37 -0800259 } else if (cpuinfo_has_arm_v8()) {
260 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
261 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
262 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
263 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
264 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
265 xnn_params.qc8.gemm.mr = 2;
266 xnn_params.qc8.gemm.nr = 8;
267 xnn_params.qc8.gemm.log2_kr = 1;
268 xnn_params.qc8.gemm.log2_sr = 2;
269 } else {
270 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
271 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
272 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
273 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
274 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
275 xnn_params.qc8.gemm.mr = 2;
276 xnn_params.qc8.gemm.nr = 8;
277 xnn_params.qc8.gemm.log2_kr = 1;
278 xnn_params.qc8.gemm.log2_sr = 2;
279 }
280 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhan898d5852021-06-30 21:18:34 -0700281
Frank Barchard0bc58012021-11-22 18:12:05 -0800282 if (cpuinfo_has_arm_neon_v8()) {
283 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800284 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800285 xnn_params.qc8.dwconv[0].channel_tile = 16;
286 xnn_params.qc8.dwconv[0].primary_tile = 9;
287 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800288 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800289 xnn_params.qc8.dwconv[1].channel_tile = 8;
290 xnn_params.qc8.dwconv[1].primary_tile = 25;
291 } else {
292 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neon_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800293 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neon_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800294 xnn_params.qc8.dwconv[0].channel_tile = 16;
295 xnn_params.qc8.dwconv[0].primary_tile = 9;
296 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neon_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -0800297 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neon_params;
Frank Barchard0bc58012021-11-22 18:12:05 -0800298 xnn_params.qc8.dwconv[1].channel_tile = 8;
299 xnn_params.qc8.dwconv[1].primary_tile = 25;
300 }
Marat Dukhan898d5852021-06-30 21:18:34 -0700301 #endif // XNN_NO_QC8_OPERATORS
302
Frank Barchardb40ee632021-12-30 11:10:02 -0800303 /**************************** QS8 AArch32 micro-kernels ****************************/
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700304 #ifndef XNN_NO_QS8_OPERATORS
305 init_flags |= XNN_INIT_FLAG_QS8;
306
Frank Barchard95198162021-12-21 17:29:10 -0800307 #if XNN_ENABLE_ASSEMBLY
308 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
Frank Barchard1228b3e2022-01-24 11:57:19 -0800309 switch (cpuinfo_get_uarch(0)->uarch) {
310 case cpuinfo_uarch_cortex_a55:
311 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55);
Frank Barchard6cc5b482022-01-26 17:01:41 -0800312 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55);
Frank Barchard1228b3e2022-01-24 11:57:19 -0800313 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
314 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
315 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
316 xnn_params.qs8.gemm.mr = 4;
317 xnn_params.qs8.gemm.nr = 8;
318 xnn_params.qs8.gemm.log2_kr = 2;
319 break;
320 default:
321 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64);
322 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64);
323 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
324 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
325 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
326 xnn_params.qs8.gemm.mr = 4;
327 xnn_params.qs8.gemm.nr = 8;
328 xnn_params.qs8.gemm.log2_kr = 2;
329 break;
330 }
Frank Barchard95198162021-12-21 17:29:10 -0800331 } else {
Frank Barchard1c852c92021-12-23 13:10:20 -0800332 switch (cpuinfo_get_uarch(0)->uarch) {
Frank Barcharda312e9a2022-02-02 11:27:50 -0800333 case cpuinfo_uarch_cortex_a7:
Frank Barchard2991acf2022-02-02 20:34:57 -0800334 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
335 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
336 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
337 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
338 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
339 xnn_params.qs8.gemm.mr = 4;
340 xnn_params.qs8.gemm.nr = 8;
341 break;
342 case cpuinfo_uarch_cortex_a35:
343 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
344 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
345 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
346 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
347 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
348 xnn_params.qs8.gemm.mr = 4;
349 xnn_params.qs8.gemm.nr = 8;
350 break;
Frank Barchard1c852c92021-12-23 13:10:20 -0800351 case cpuinfo_uarch_cortex_a53:
Frank Barchard77a3b5f2022-02-02 00:37:10 -0800352 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
353 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
354 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
355 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
356 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
357 xnn_params.qs8.gemm.mr = 4;
358 xnn_params.qs8.gemm.nr = 8;
359 break;
Frank Barcharda312e9a2022-02-02 11:27:50 -0800360 case cpuinfo_uarch_cortex_a55r0:
361 case cpuinfo_uarch_kryo:
362 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
363 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
364 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
365 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
366 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
367 xnn_params.qs8.gemm.mr = 4;
368 xnn_params.qs8.gemm.nr = 8;
369 break;
Frank Barchard1c852c92021-12-23 13:10:20 -0800370 case cpuinfo_uarch_cortex_a72:
371 case cpuinfo_uarch_exynos_m1:
372 case cpuinfo_uarch_exynos_m2:
373 case cpuinfo_uarch_exynos_m3:
Frank Barchard1c852c92021-12-23 13:10:20 -0800374 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
375 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
376 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
377 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
378 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
379 xnn_params.qs8.gemm.mr = 4;
380 xnn_params.qs8.gemm.nr = 8;
381 break;
382 default:
383 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
384 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
385 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
386 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
387 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
388 xnn_params.qs8.gemm.mr = 4;
389 xnn_params.qs8.gemm.nr = 8;
390 break;
391 }
Frank Barchard95198162021-12-21 17:29:10 -0800392 }
Frank Barchard364598a2022-01-24 20:39:26 -0800393 #if XNN_MAX_UARCH_TYPES > 1
394 {
395 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
Frank Barchardba5091f2022-01-25 13:31:26 -0800396 const uint32_t mr = xnn_params.qs8.gemm.mr;
397 const uint32_t nr = xnn_params.qs8.gemm.nr;
398 const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
Frank Barchard364598a2022-01-24 20:39:26 -0800399 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
400 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
401 if (uarch_info == NULL) {
402 /* No more microarchitectures in the system */
403 break;
404 }
405
406 switch (uarch_info->uarch) {
Frank Barchardba5091f2022-01-25 13:31:26 -0800407 case cpuinfo_uarch_cortex_a55:
408 if (mr == 4 && nr == 8 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
409 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55;
Frank Barchard6cc5b482022-01-26 17:01:41 -0800410 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55;
Frank Barchardba5091f2022-01-25 13:31:26 -0800411 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot;
412 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot;
413 }
414 break;
Frank Barchard364598a2022-01-24 20:39:26 -0800415 case cpuinfo_uarch_cortex_a53:
Frank Barchard364598a2022-01-24 20:39:26 -0800416 if (mr == 4 && nr == 8 && log2_kr == 0) {
Frank Barchard77a3b5f2022-02-02 00:37:10 -0800417 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
Frank Barchard364598a2022-01-24 20:39:26 -0800418 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64;
419 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
420 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
421 }
422 break;
Frank Barcharda312e9a2022-02-02 11:27:50 -0800423 case cpuinfo_uarch_cortex_a55r0:
424 if (mr == 4 && nr == 8 && log2_kr == 0) {
425 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
426 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64;
427 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
428 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
429 }
430 break;
Frank Barchard364598a2022-01-24 20:39:26 -0800431 default:
432 break;
433 }
434 }
435 }
436 #endif // XNN_MAX_UARCH_TYPES > 1
Frank Barchard95198162021-12-21 17:29:10 -0800437 #else // XNN_ENABLE_ASSEMBLY
438 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
439 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
440 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
441 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
442 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
443 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
444 xnn_params.qs8.gemm.mr = 4;
445 xnn_params.qs8.gemm.nr = 8;
446 xnn_params.qs8.gemm.log2_kr = 2;
447 } else {
448 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
449 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
450 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
451 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
452 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
453 xnn_params.qs8.gemm.mr = 2;
454 xnn_params.qs8.gemm.nr = 8;
455 xnn_params.qs8.gemm.log2_kr = 1;
456 xnn_params.qs8.gemm.log2_sr = 2;
457 }
458 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700459
Frank Barchard0d065732021-08-31 00:01:40 -0700460 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
Marat Dukhanbe18f5c2021-07-16 18:46:39 -0700461 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard0d065732021-08-31 00:01:40 -0700462 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700463 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan5f2939f2021-07-23 13:38:32 -0700464 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mla8_ld64;
Marat Dukhanbe18f5c2021-07-16 18:46:39 -0700465 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -0700466 xnn_params.qs8.dwconv[1].channel_tile = 8;
467 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700468
469 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan85755042022-01-13 01:46:05 -0800470 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
471 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
472 .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
473 .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -0800474 .row_tile = 7,
475 .channel_tile = 8,
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700476 };
Marat Dukhanff209482020-09-03 14:26:53 -0700477
478 xnn_params.qs8.vadd = (struct vbinary_parameters) {
Marat Dukhan01debd92021-07-29 18:14:21 -0700479 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16,
480 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
481 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -0700482 .init.qs8_addsub = xnn_init_qs8_add_minmax_neon_params,
Marat Dukhan01debd92021-07-29 18:14:21 -0700483 .element_tile = 16,
Marat Dukhanff209482020-09-03 14:26:53 -0700484 };
Marat Dukhan33a98fa2022-01-13 00:08:57 -0800485 xnn_params.qs8.vmul = (struct vbinary_parameters) {
486 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
487 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
488 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
489 .init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params,
490 .element_tile = 16,
491 };
Marat Dukhanf28cddf2020-08-10 14:05:02 -0700492 #endif // XNN_NO_QS8_OPERATORS
493
Frank Barchardb40ee632021-12-30 11:10:02 -0800494 /*************************** QU8 AArch32 micro-kernels ***************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -0700495 #ifndef XNN_NO_QU8_OPERATORS
496 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700497
Frank Barchard1d5c6162022-02-03 02:21:50 -0800498 #if XNN_ENABLE_ASSEMBLY
499 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
500 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
501 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
502 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
503 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
504 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
505 xnn_params.qu8.gemm.mr = 4;
506 xnn_params.qu8.gemm.nr = 8;
507 xnn_params.qu8.gemm.log2_kr = 2;
508 } else {
509 switch (cpuinfo_get_uarch(0)->uarch) {
510 case cpuinfo_uarch_cortex_a7:
511 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
512 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
513 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
514 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
515 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
516 xnn_params.qu8.gemm.mr = 4;
517 xnn_params.qu8.gemm.nr = 8;
518 break;
519 case cpuinfo_uarch_cortex_a35:
520 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
521 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
522 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
523 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
524 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
525 xnn_params.qu8.gemm.mr = 4;
526 xnn_params.qu8.gemm.nr = 8;
527 break;
528 case cpuinfo_uarch_cortex_a53:
529 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
530 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
531 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
532 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
533 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
534 xnn_params.qu8.gemm.mr = 4;
535 xnn_params.qu8.gemm.nr = 8;
536 break;
537 case cpuinfo_uarch_cortex_a55r0:
538 case cpuinfo_uarch_kryo:
539 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
540 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
541 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
542 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
543 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
544 xnn_params.qu8.gemm.mr = 4;
545 xnn_params.qu8.gemm.nr = 8;
546 break;
547 case cpuinfo_uarch_cortex_a72:
548 case cpuinfo_uarch_exynos_m1:
549 case cpuinfo_uarch_exynos_m2:
550 case cpuinfo_uarch_exynos_m3:
551 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
552 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
553 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
554 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
555 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
556 xnn_params.qu8.gemm.mr = 4;
557 xnn_params.qu8.gemm.nr = 8;
558 break;
559 default:
560 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
561 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
562 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
563 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
564 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
565 xnn_params.qu8.gemm.mr = 4;
566 xnn_params.qu8.gemm.nr = 8;
567 break;
568 }
569 }
570 #if XNN_MAX_UARCH_TYPES > 1
571 {
572 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
573 const uint32_t mr = xnn_params.qu8.gemm.mr;
574 const uint32_t nr = xnn_params.qu8.gemm.nr;
575 const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
576 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
577 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
578 if (uarch_info == NULL) {
579 /* No more microarchitectures in the system */
580 break;
581 }
582
583 switch (uarch_info->uarch) {
584 case cpuinfo_uarch_cortex_a53:
585 if (mr == 4 && nr == 8 && log2_kr == 0) {
586 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
587 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64;
588 xnn_params.qu8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
589 xnn_params.qu8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
590 }
591 break;
592 case cpuinfo_uarch_cortex_a55r0:
593 if (mr == 4 && nr == 8 && log2_kr == 0) {
594 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
595 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64;
596 xnn_params.qu8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
597 xnn_params.qu8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
598 }
599 break;
600 default:
601 break;
602 }
603 }
604 }
605 #endif // XNN_MAX_UARCH_TYPES > 1
606 #else // XNN_ENABLE_ASSEMBLY
607 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
608 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
609 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
610 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
611 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
612 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
613 xnn_params.qu8.gemm.mr = 4;
614 xnn_params.qu8.gemm.nr = 8;
615 xnn_params.qu8.gemm.log2_kr = 2;
616 } else {
617 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
618 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
619 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
620 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
621 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
622 xnn_params.qu8.gemm.mr = 2;
623 xnn_params.qu8.gemm.nr = 8;
624 xnn_params.qu8.gemm.log2_kr = 1;
625 xnn_params.qu8.gemm.log2_sr = 2;
626 }
627 #endif // XNN_ENABLE_ASSEMBLY
628
Frank Barchard354cbc62021-09-27 21:42:41 -0700629 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -0700630 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barchard0d065732021-08-31 00:01:40 -0700631 xnn_params.qu8.dwconv[0].channel_tile = 16;
Marat Dukhan08b7a972020-07-14 18:17:29 -0700632 xnn_params.qu8.dwconv[0].primary_tile = 9;
Frank Barchard354cbc62021-09-27 21:42:41 -0700633 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -0700634 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Marat Dukhan43b46ee2021-07-15 19:07:50 -0700635 xnn_params.qu8.dwconv[1].channel_tile = 8;
636 xnn_params.qu8.dwconv[1].primary_tile = 25;
637
Marat Dukhan08b7a972020-07-14 18:17:29 -0700638 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -0800639 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
640 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
Marat Dukhan3c949a32022-01-09 20:12:33 -0800641 .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -0800642 .primary_tile = 9,
643 .incremental_tile = 8,
644 .channel_tile = 8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700645 };
Marat Dukhan08b7a972020-07-14 18:17:29 -0700646 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan85755042022-01-13 01:46:05 -0800647 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
648 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
649 .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
650 .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -0800651 .row_tile = 7,
652 .channel_tile = 8,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700653 };
Marat Dukhandb007cd2021-07-20 23:42:39 -0700654 xnn_params.qu8.vadd = (struct vbinary_parameters) {
Frank Barchard0a3093c2021-08-31 09:58:11 -0700655 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16,
656 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
657 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -0700658 .init.qu8_addsub = xnn_init_qu8_add_minmax_neon_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -0700659 .element_tile = 8,
660 };
Marat Dukhan33a98fa2022-01-13 00:08:57 -0800661 xnn_params.qu8.vmul = (struct vbinary_parameters) {
662 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
663 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
664 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
665 .init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params,
666 .element_tile = 16,
667 };
Marat Dukhan08b7a972020-07-14 18:17:29 -0700668 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -0700669
Frank Barchardb40ee632021-12-30 11:10:02 -0800670 /**************************** S8 AArch32 micro-kernels ****************************/
Marat Dukhan23147532021-08-16 07:26:56 -0700671 #ifndef XNN_NO_S8_OPERATORS
672 init_flags |= XNN_INIT_FLAG_S8;
673
Marat Dukhan61c0c9e2021-08-16 23:16:14 -0700674 xnn_params.s8.clamp = (struct vunary_parameters) {
675 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
676 .init.s8_minmax = xnn_init_s8_minmax_neon_params,
677 .element_tile = 64,
678 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -0800679 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
680 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c8,
681 .pixel_tile = 1,
682 .channel_tile = 8,
683 };
Marat Dukhan23147532021-08-16 07:26:56 -0700684 xnn_params.s8.maxpool = (struct maxpool_parameters) {
685 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhandc5c1482021-08-16 09:03:15 -0700686 .init.s8 = xnn_init_s8_minmax_neon_params,
Marat Dukhan23147532021-08-16 07:26:56 -0700687 .mr = 9,
688 .qr = 8,
689 };
690 #endif // XNN_NO_S8_OPERATORS
691
Frank Barchardb40ee632021-12-30 11:10:02 -0800692 /**************************** U8 AArch32 micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -0700693 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700694 init_flags |= XNN_INIT_FLAG_U8;
695
Marat Dukhan94912792021-08-16 21:40:30 -0700696 xnn_params.u8.clamp = (struct vunary_parameters) {
697 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
698 .init.u8_minmax = xnn_init_u8_minmax_neon_params,
699 .element_tile = 64,
700 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -0800701 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
702 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c8,
703 .pixel_tile = 1,
704 .channel_tile = 8,
705 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700706 xnn_params.u8.maxpool = (struct maxpool_parameters) {
707 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhan2ea50a02021-08-16 12:59:19 -0700708 .init.u8 = xnn_init_u8_minmax_neon_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700709 .mr = 9,
710 .qr = 8,
711 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700712 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
713 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
714 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -0700715
Frank Barchardb40ee632021-12-30 11:10:02 -0800716 /**************************** X8 AArch32 micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -0700717 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700718 init_flags |= XNN_INIT_FLAG_X8;
719
Marat Dukhand67539d2021-09-08 23:06:03 -0700720 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700721 xnn_params.x8.zip = (struct zip_parameters) {
722 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
723 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
724 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
725 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
726 };
727 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -0700728
Frank Barchardb40ee632021-12-30 11:10:02 -0800729 /**************************** F32 AArch32 micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -0700730 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -0700731 init_flags |= XNN_INIT_FLAG_F32;
732
Marat Dukhan3b745a42020-05-10 21:43:25 -0700733 #if XNN_ENABLE_ASSEMBLY
734 switch (cpuinfo_get_uarch(0)->uarch) {
735 case cpuinfo_uarch_cortex_a5:
736 case cpuinfo_uarch_cortex_a7:
Frank Barchard490febe2020-07-16 18:42:17 -0700737 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
738 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
Marat Dukhan3b745a42020-05-10 21:43:25 -0700739 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
740 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700741 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700742 xnn_params.f32.gemm.mr = 4;
743 xnn_params.f32.gemm.nr = 8;
Marat Dukhan05702cf2020-03-26 15:41:33 -0700744 break;
Marat Dukhan05702cf2020-03-26 15:41:33 -0700745
Marat Dukhan3b745a42020-05-10 21:43:25 -0700746 case cpuinfo_uarch_cortex_a53:
747 case cpuinfo_uarch_cortex_a55r0:
748 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
749 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
750 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
751 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700752 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700753 xnn_params.f32.gemm.mr = 4;
754 xnn_params.f32.gemm.nr = 8;
755 break;
756
Frank Barchardf975ee02021-11-05 16:01:00 -0700757 case cpuinfo_uarch_cortex_a35:
Marat Dukhan3b745a42020-05-10 21:43:25 -0700758 case cpuinfo_uarch_cortex_a55:
759 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
760 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
761 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
762 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700763 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700764 xnn_params.f32.gemm.mr = 4;
765 xnn_params.f32.gemm.nr = 8;
766 break;
767
768 case cpuinfo_uarch_cortex_a57:
769 case cpuinfo_uarch_cortex_a72:
770 case cpuinfo_uarch_cortex_a73:
Frank Barchard78735862022-01-04 16:47:44 -0800771 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75);
772 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75);
Marat Dukhan3b745a42020-05-10 21:43:25 -0700773 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
774 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700775 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700776 xnn_params.f32.gemm.mr = 4;
777 xnn_params.f32.gemm.nr = 8;
778 break;
779
780 case cpuinfo_uarch_krait:
781 default:
782 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
783 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
784 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
785 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700786 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700787 xnn_params.f32.gemm.mr = 4;
788 xnn_params.f32.gemm.nr = 8;
789 break;
790 }
791 #if XNN_MAX_UARCH_TYPES > 1
792 {
793 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
794 const uint32_t mr = xnn_params.f32.gemm.mr;
795 const uint32_t nr = xnn_params.f32.gemm.nr;
796 const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
797 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
798 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
799 if (uarch_info == NULL) {
800 /* No more microarchitectures in the system */
Marat Dukhan05702cf2020-03-26 15:41:33 -0700801 break;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700802 }
803
804 switch (uarch_info->uarch) {
805 case cpuinfo_uarch_cortex_a53:
806 case cpuinfo_uarch_cortex_a55r0:
807 if (mr == 4 && nr == 8 && log2_sr == 0) {
808 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
809 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
810 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
811 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
812 }
813 break;
814 case cpuinfo_uarch_cortex_a55:
815 if (mr == 4 && nr == 8 && log2_sr == 0) {
816 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
817 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
818 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
819 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
820 }
821 break;
822 default:
823 break;
824 }
Marat Dukhan05702cf2020-03-26 15:41:33 -0700825 }
826 }
Marat Dukhan3b745a42020-05-10 21:43:25 -0700827 #endif // XNN_MAX_UARCH_TYPES > 1
828 #else // XNN_ENABLE_ASSEMBLY
829 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128);
830 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128);
831 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
832 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700833 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700834 xnn_params.f32.gemm.mr = 4;
835 xnn_params.f32.gemm.nr = 8;
836 #endif // XNN_ENABLE_ASSEMBLY
837 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64);
838 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700839 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700840 xnn_params.f32.gemm2.mr = 4;
841 xnn_params.f32.gemm2.nr = 2;
842
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700843 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neon;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700844 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Frank Barcharddbe781b2021-10-18 10:29:52 -0700845 xnn_params.f32.dwconv[0].channel_tile = 8,
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700846 xnn_params.f32.dwconv[0].primary_tile = 3,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700847
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700848 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neon;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700849 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700850 xnn_params.f32.dwconv[1].channel_tile = 8,
851 xnn_params.f32.dwconv[1].primary_tile = 4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700852
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700853 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neon;
Marat Dukhan104ae5e2021-05-24 13:41:57 -0700854 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Frank Barcharddbe781b2021-10-18 10:29:52 -0700855 xnn_params.f32.dwconv[2].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -0700856 xnn_params.f32.dwconv[2].primary_tile = 9;
857
858 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neon_acc2;
859 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
860 xnn_params.f32.dwconv[3].channel_tile = 8;
861 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan3b745a42020-05-10 21:43:25 -0700862
863 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -0800864 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
865 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
866 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
867 .primary_tile = 9,
868 .incremental_tile = 8,
869 .channel_tile = 4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700870 };
871 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -0800872 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
873 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
874 .primary_tile = 9,
875 .incremental_tile = 8,
876 .channel_tile = 4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700877 };
878 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -0800879 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
880 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
881 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
882 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
883 .row_tile = 7,
884 .channel_tile = 4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700885 };
886 xnn_params.f32.maxpool = (struct maxpool_parameters) {
887 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
Marat Dukhan91ae1652021-08-15 19:19:49 -0700888 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700889 .mr = 9,
890 .qr = 8,
891 };
892 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -0700893 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700894 .mr = 4,
895 };
896 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -0700897 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700898 .mr = 9,
899 };
900 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -0700901 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
Marat Dukhan3b745a42020-05-10 21:43:25 -0700902 .mr = 9,
903 .qr = 8,
904 };
905 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
906 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neon_c8,
907 .pixel_tile = 1,
908 .channel_tile = 8,
909 };
Marat Dukhane5efb162021-12-31 10:26:13 -0800910 xnn_params.f32.abs = (struct vunary_parameters) {
911 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8,
912 .element_tile = 8,
913 };
Marat Dukhan94912792021-08-16 21:40:30 -0700914 xnn_params.f32.clamp = (struct vunary_parameters) {
915 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
916 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
917 .element_tile = 8,
918 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -0800919 if (cpuinfo_has_arm_neon_fma()) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -0800920 xnn_params.f32.elu = (struct vunary_parameters) {
921 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_p6_x8,
922 .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_p6_params,
923 .element_tile = 8,
924 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -0800925 } else {
Marat Dukhan4a79ff22022-01-01 12:16:48 -0800926 xnn_params.f32.elu = (struct vunary_parameters) {
927 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x8,
928 .init.f32_elu = xnn_init_f32_elu_neon_rr2_lut16_p3_params,
929 .element_tile = 8,
930 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -0800931 }
Marat Dukhan561d0682021-12-23 16:12:35 -0800932 xnn_params.f32.hswish = (struct vunary_parameters) {
933 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -0800934 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -0800935 .element_tile = 16,
936 };
Marat Dukhan2894e992021-12-30 08:29:48 -0800937 xnn_params.f32.lrelu = (struct vunary_parameters) {
938 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8,
939 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
940 .element_tile = 8,
941 };
Marat Dukhane5efb162021-12-31 10:26:13 -0800942 xnn_params.f32.neg = (struct vunary_parameters) {
943 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
944 .element_tile = 8,
945 };
Marat Dukhan64e52512020-06-09 13:41:16 -0700946 if (cpuinfo_has_arm_neon_v8()) {
Marat Dukhan0e801372022-01-04 00:10:41 -0800947 xnn_params.f32.rndne = (struct vunary_parameters) {
948 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
949 .element_tile = 8,
950 };
951 xnn_params.f32.rndz = (struct vunary_parameters) {
952 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
953 .element_tile = 8,
954 };
955 xnn_params.f32.rndu = (struct vunary_parameters) {
956 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
957 .element_tile = 8,
958 };
959 xnn_params.f32.rndd = (struct vunary_parameters) {
960 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
961 .element_tile = 8,
962 };
Marat Dukhan64e52512020-06-09 13:41:16 -0700963 } else {
Marat Dukhan0e801372022-01-04 00:10:41 -0800964 xnn_params.f32.rndne = (struct vunary_parameters) {
965 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neon_x8,
966 .element_tile = 8,
967 };
968 xnn_params.f32.rndz = (struct vunary_parameters) {
969 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neon_x8,
970 .element_tile = 8,
971 };
972 xnn_params.f32.rndu = (struct vunary_parameters) {
973 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neon_x8,
974 .element_tile = 8,
975 };
976 xnn_params.f32.rndd = (struct vunary_parameters) {
977 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neon_x8,
978 .element_tile = 8,
979 };
Marat Dukhan64e52512020-06-09 13:41:16 -0700980 }
Marat Dukhance834ad2022-01-03 00:22:01 -0800981 xnn_params.f32.sigmoid = (struct vunary_parameters) {
982 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8,
Marat Dukhanbbfc27d2022-01-03 13:47:00 -0800983 .init.f32_sigmoid = xnn_init_f32_sigmoid_neon_rr2_lut64_p2_params,
Marat Dukhance834ad2022-01-03 00:22:01 -0800984 .element_tile = 8,
985 };
Marat Dukhane5efb162021-12-31 10:26:13 -0800986 xnn_params.f32.sqr = (struct vunary_parameters) {
987 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8,
988 .element_tile = 8,
989 };
Marat Dukhane72b2822021-12-30 14:46:58 -0800990 xnn_params.f32.sqrt = (struct vunary_parameters) {
991 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
992 .element_tile = 1,
993 };
Marat Dukhan3b745a42020-05-10 21:43:25 -0700994 xnn_params.f32.prelu = (struct prelu_parameters) {
995 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
996 .row_tile = 2,
997 .channel_tile = 8,
998 };
Marat Dukhan4a5c7712022-01-05 22:43:13 -0800999 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
1000 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_x8,
1001 .init = xnn_init_f32_expminus_neon_rr2_lut64_p2_params,
1002 .element_tile = 8,
1003 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001004 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
1005 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001006 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
1007 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
1008 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001009 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001010 .element_tile = 8,
1011 };
1012 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001013 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
1014 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
1015 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
Marat Dukhanf6004972021-12-30 11:23:02 -08001016 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001017 .element_tile = 2,
1018 };
1019 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001020 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
1021 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
1022 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001023 .element_tile = 8,
1024 };
1025 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001026 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
1027 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
1028 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001029 .element_tile = 8,
1030 };
1031 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001032 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
1033 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
1034 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001035 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001036 .element_tile = 8,
1037 };
1038 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001039 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
1040 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
1041 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001042 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001043 .element_tile = 8,
1044 };
Marat Dukhanf7399262020-06-05 10:58:44 -07001045 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001046 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
1047 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
1048 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07001049 .element_tile = 8,
1050 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001051 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07001052 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neon_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07001053 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001054 .channel_tile = 4,
1055 .row_tile = 2,
1056 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07001057 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08001058 init_flags |= XNN_INIT_FLAG_CHW_OPT;
1059
Marat Dukhan3e913382020-12-07 13:36:08 -08001060 xnn_params.f32.spmm = (struct spmm_parameters) {
1061 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neon,
1062 .mr = 32,
1063 .nr = 1,
1064 };
Marat Dukhanc7634882020-12-07 15:11:12 -08001065 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
1066 .ukernel_with_symm_padding =
1067 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2,
1068 .output_channel_tile = 4,
1069 .output_height_tile = 2,
1070 .output_width_tile = 2,
1071 };
Marat Dukhan3e913382020-12-07 13:36:08 -08001072 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
1073 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4,
Marat Dukhan3e913382020-12-07 13:36:08 -08001074 .output_width_tile = 4,
1075 .output_height_tile = 2,
1076 };
1077 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
1078 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4,
Marat Dukhan3e913382020-12-07 13:36:08 -08001079 .output_width_tile = 4,
1080 .output_height_tile = 1,
1081 };
1082 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
1083 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4,
Marat Dukhan3e913382020-12-07 13:36:08 -08001084 .output_width_tile = 4,
1085 .output_height_tile = 1,
1086 };
1087 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
1088 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4,
Marat Dukhan3e913382020-12-07 13:36:08 -08001089 .output_width_tile = 4,
1090 .output_height_tile = 1,
1091 };
1092 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
1093 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
1094 .channel_tile = 4,
1095 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07001096 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
Artsiom Ablavatski2202c812021-01-22 14:16:43 -08001097 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neon_p8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07001098 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07001099 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07001100 };
1101 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -07001102 #endif // XNN_NO_F32_OPERATORS
1103
Frank Barchardb40ee632021-12-30 11:10:02 -08001104 /*************************** VCVT AArch32 micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001105 #ifndef XNN_NO_VCVT_OPERATORS
1106 init_flags |= XNN_INIT_FLAG_VCVT;
1107
1108 if (cpuinfo_has_arm_neon_fp16()) {
Marat Dukhan134f9842021-12-29 19:57:31 -08001109 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1110 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
1111 .element_tile = 16,
1112 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08001113 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1114 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
1115 .element_tile = 16,
1116 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001117 } else {
Marat Dukhan134f9842021-12-29 19:57:31 -08001118 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1119 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neon_int16_x16,
1120 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_neon_params,
1121 .element_tile = 16,
1122 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08001123 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1124 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neon_x8,
1125 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_neon_params,
1126 .element_tile = 8,
1127 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001128 }
Marat Dukhaned2d7762021-12-03 23:51:19 -08001129 if (cpuinfo_has_arm_neon_v8()) {
1130 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1131 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
1132 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
1133 .element_tile = 32,
1134 };
1135 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1136 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
1137 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
1138 .element_tile = 32,
1139 };
1140 } else {
1141 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1142 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neon_x32,
1143 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neon_params,
1144 .element_tile = 32,
1145 };
1146 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1147 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neon_x32,
1148 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neon_params,
1149 .element_tile = 32,
1150 };
1151 }
Marat Dukhanf92206b2021-12-10 17:02:07 -08001152 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
1153 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__neon_x32,
1154 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
1155 .element_tile = 32,
1156 };
1157 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
1158 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__neon_x32,
1159 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
1160 .element_tile = 32,
1161 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001162 #endif // XNN_NO_VCVT_OPERATORS
1163
Frank Barchardb40ee632021-12-30 11:10:02 -08001164 /**************************** X32 AArch32 micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -07001165 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001166 init_flags |= XNN_INIT_FLAG_X32;
1167
Marat Dukhan3b745a42020-05-10 21:43:25 -07001168 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
1169 xnn_params.x32.zip = (struct zip_parameters) {
1170 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
1171 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
1172 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
1173 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
1174 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08001175 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08001176 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
1177 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08001178 .channel_tile = 1,
1179 .pixel_tile = 1,
1180 };
1181 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -07001182 #endif // XNN_NO_X32_OPERATORS
Marat Dukhan933051b2021-08-07 16:26:15 -07001183
Frank Barchardb40ee632021-12-30 11:10:02 -08001184 /**************************** XX AArch32 micro-kernels ****************************/
Marat Dukhan933051b2021-08-07 16:26:15 -07001185 #ifndef XNN_NO_XX_OPERATORS
1186 init_flags |= XNN_INIT_FLAG_XX;
1187
1188 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
1189 xnn_params.xx.fill = (struct fill_parameters) {
1190 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
1191 .row_tile = 1,
1192 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07001193 xnn_params.xx.pad = (struct pad_parameters) {
1194 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
1195 .row_tile = 1,
1196 };
Marat Dukhan933051b2021-08-07 16:26:15 -07001197 #endif // XNN_NO_XX_OPERATORS
1198
Marat Dukhan3b745a42020-05-10 21:43:25 -07001199 } else if (!XNN_PLATFORM_MOBILE) {
Marat Dukhan933051b2021-08-07 16:26:15 -07001200
Frank Barchardb40ee632021-12-30 11:10:02 -08001201 /*************************** QS8 AArch32 Pre-NEON micro-kernels ***************************/
Marat Dukhan66a3ca12021-08-06 18:24:19 -07001202 #ifndef XNN_NO_QS8_OPERATORS
1203 init_flags |= XNN_INIT_FLAG_QS8;
1204
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001205 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
1206 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
1207 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
1208 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
1209 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan66a3ca12021-08-06 18:24:19 -07001210 xnn_params.qs8.gemm.mr = 2;
1211 xnn_params.qs8.gemm.nr = 2;
1212
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001213 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
1214 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan66a3ca12021-08-06 18:24:19 -07001215 xnn_params.qs8.dwconv[0].channel_tile = 1;
1216 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001217 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
1218 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan66a3ca12021-08-06 18:24:19 -07001219 xnn_params.qs8.dwconv[1].channel_tile = 1;
1220 xnn_params.qs8.dwconv[1].primary_tile = 25;
1221
1222 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan847ff5e2022-01-11 20:31:06 -08001223 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
1224 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
Marat Dukhan53f41062022-01-11 19:44:57 -08001225 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
1226 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08001227 .row_tile = 7,
1228 .channel_tile = 1,
Marat Dukhan66a3ca12021-08-06 18:24:19 -07001229 };
1230 xnn_params.qs8.vadd = (struct vbinary_parameters) {
1231 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x1,
1232 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
1233 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
Marat Dukhan64287252021-09-07 16:20:03 -07001234 .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
Marat Dukhan66a3ca12021-08-06 18:24:19 -07001235 .element_tile = 1,
1236 };
1237 xnn_params.qs8.vmul = (struct vbinary_parameters) {
1238 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
1239 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
1240 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
1241 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
1242 .element_tile = 4,
1243 };
1244 #endif // XNN_NO_QS8_OPERATORS
1245
Frank Barchardb40ee632021-12-30 11:10:02 -08001246 /*************************** QU8 AArch32 Pre-NEON micro-kernels ***************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -07001247 #ifndef XNN_NO_QU8_OPERATORS
1248 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001249
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001250 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
1251 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
1252 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
1253 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
1254 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan08b7a972020-07-14 18:17:29 -07001255 xnn_params.qu8.gemm.mr = 2;
1256 xnn_params.qu8.gemm.nr = 2;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001257
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001258 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
1259 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan08b7a972020-07-14 18:17:29 -07001260 xnn_params.qu8.dwconv[0].channel_tile = 1;
1261 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001262 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
1263 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan43b46ee2021-07-15 19:07:50 -07001264 xnn_params.qu8.dwconv[1].channel_tile = 1;
1265 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001266
Marat Dukhan08b7a972020-07-14 18:17:29 -07001267 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08001268 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
1269 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
1270 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
1271 .primary_tile = 9,
1272 .incremental_tile = 8,
1273 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001274 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07001275 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhand1f53e42022-01-12 22:34:51 -08001276 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
1277 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
1278 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
1279 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08001280 .row_tile = 7,
1281 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001282 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07001283 xnn_params.qu8.vadd = (struct vbinary_parameters) {
1284 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x1,
1285 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
1286 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
Marat Dukhan64287252021-09-07 16:20:03 -07001287 .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -07001288 .element_tile = 1,
1289 };
Marat Dukhan3c5e6622021-08-06 00:38:05 -07001290 xnn_params.qu8.vmul = (struct vbinary_parameters) {
1291 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
1292 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
1293 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
1294 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
1295 .element_tile = 4,
1296 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07001297 #endif // XNN_NO_QU8_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -07001298
Frank Barchardb40ee632021-12-30 11:10:02 -08001299 /**************************** S8 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan23147532021-08-16 07:26:56 -07001300 #ifndef XNN_NO_S8_OPERATORS
1301 init_flags |= XNN_INIT_FLAG_S8;
1302
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07001303 xnn_params.s8.clamp = (struct vunary_parameters) {
1304 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
Marat Dukhan3b9b4bc2021-08-24 00:23:07 -07001305 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07001306 .element_tile = 4,
1307 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08001308 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
1309 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
1310 .pixel_tile = 1,
1311 .channel_tile = 1,
1312 };
Marat Dukhan23147532021-08-16 07:26:56 -07001313 xnn_params.s8.maxpool = (struct maxpool_parameters) {
1314 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
1315 .init.s8 = xnn_init_s8_minmax_scalar_params,
1316 .mr = 9,
1317 .qr = 8,
1318 };
1319 #endif // XNN_NO_S8_OPERATORS
1320
Frank Barchardb40ee632021-12-30 11:10:02 -08001321 /**************************** U8 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -07001322 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001323 init_flags |= XNN_INIT_FLAG_U8;
1324
Marat Dukhan94912792021-08-16 21:40:30 -07001325 xnn_params.u8.clamp = (struct vunary_parameters) {
1326 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
Marat Dukhan3b9b4bc2021-08-24 00:23:07 -07001327 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
Marat Dukhan94912792021-08-16 21:40:30 -07001328 .element_tile = 4,
1329 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08001330 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
1331 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
1332 .pixel_tile = 1,
1333 .channel_tile = 1,
1334 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001335 xnn_params.u8.maxpool = (struct maxpool_parameters) {
1336 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07001337 .init.u8 = xnn_init_u8_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001338 .mr = 9,
1339 .qr = 8,
1340 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001341 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
1342 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
1343 #endif // XNN_NO_U8_OPERATORS
1344
Frank Barchardb40ee632021-12-30 11:10:02 -08001345 /**************************** X8 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -07001346 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001347 init_flags |= XNN_INIT_FLAG_X8;
1348
Marat Dukhand67539d2021-09-08 23:06:03 -07001349 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001350 xnn_params.x8.zip = (struct zip_parameters) {
1351 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
1352 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
1353 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
1354 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
1355 };
1356 #endif // XNN_NO_X8_OPERATORS
1357
Frank Barchardb40ee632021-12-30 11:10:02 -08001358 /**************************** F32 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -07001359 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001360 init_flags |= XNN_INIT_FLAG_F32;
1361
Marat Dukhan3b745a42020-05-10 21:43:25 -07001362 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
1363 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
1364 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
1365 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
Marat Dukhan467f6362020-05-22 23:21:55 -07001366 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
1367 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
1368 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
1369 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
Marat Dukhan3b745a42020-05-10 21:43:25 -07001370 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
1371 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
1372 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
1373 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001374 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001375 xnn_params.f32.gemm.mr = 4;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001376 xnn_params.f32.gemm.nr = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001377
Marat Dukhan3b745a42020-05-10 21:43:25 -07001378 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
1379 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar),
1380 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
1381 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar),
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001382 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001383 xnn_params.f32.gemm2.mr = 4;
1384 xnn_params.f32.gemm2.nr = 2;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001385
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001386 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
1387 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001388 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001389 xnn_params.f32.dwconv[0].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001390 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001391
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001392 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
1393 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001394 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001395 xnn_params.f32.dwconv[1].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001396 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07001397
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001398 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
1399 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07001400 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhan3b745a42020-05-10 21:43:25 -07001401 xnn_params.f32.dwconv[2].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07001402 xnn_params.f32.dwconv[2].primary_tile = 9;
1403
1404 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
1405 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
1406 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
1407 xnn_params.f32.dwconv[3].channel_tile = 1;
1408 xnn_params.f32.dwconv[3].primary_tile = 25;
XNNPACK Teamb455b122019-09-27 18:10:33 -07001409
Marat Dukhan3b745a42020-05-10 21:43:25 -07001410 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08001411 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
1412 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
1413 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
1414 .primary_tile = 9,
1415 .incremental_tile = 8,
1416 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001417 };
1418 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08001419 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
1420 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
1421 .primary_tile = 9,
1422 .incremental_tile = 8,
1423 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001424 };
1425 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08001426 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
1427 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
1428 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
1429 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
1430 .row_tile = 7,
1431 .channel_tile = 1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001432 };
1433 xnn_params.f32.maxpool = (struct maxpool_parameters) {
1434 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07001435 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001436 .mr = 9,
1437 .qr = 8,
1438 };
1439 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
1440 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
1441 .mr = 4,
1442 };
1443 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
1444 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
1445 .mr = 9,
1446 };
1447 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
1448 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
1449 .mr = 9,
1450 .qr = 8,
1451 };
1452 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
1453 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
1454 .pixel_tile = 1,
1455 .channel_tile = 2,
1456 };
Marat Dukhane5efb162021-12-31 10:26:13 -08001457 xnn_params.f32.abs = (struct vunary_parameters) {
1458 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
1459 .element_tile = 4,
1460 };
Marat Dukhan94912792021-08-16 21:40:30 -07001461 xnn_params.f32.clamp = (struct vunary_parameters) {
1462 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
1463 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1464 .element_tile = 4,
1465 };
Marat Dukhan4a79ff22022-01-01 12:16:48 -08001466 xnn_params.f32.elu = (struct vunary_parameters) {
1467 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
1468 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
1469 .element_tile = 4,
1470 };
Marat Dukhan561d0682021-12-23 16:12:35 -08001471 xnn_params.f32.hswish = (struct vunary_parameters) {
1472 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08001473 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08001474 .element_tile = 4,
1475 };
Marat Dukhan2894e992021-12-30 08:29:48 -08001476 xnn_params.f32.lrelu = (struct vunary_parameters) {
1477 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
1478 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
1479 .element_tile = 4,
1480 };
Marat Dukhane5efb162021-12-31 10:26:13 -08001481 xnn_params.f32.neg = (struct vunary_parameters) {
1482 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
1483 .element_tile = 4,
1484 };
Marat Dukhan0e801372022-01-04 00:10:41 -08001485 xnn_params.f32.rndne = (struct vunary_parameters) {
1486 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
1487 .element_tile = 1,
1488 };
1489 xnn_params.f32.rndz = (struct vunary_parameters) {
1490 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
1491 .element_tile = 1,
1492 };
1493 xnn_params.f32.rndu = (struct vunary_parameters) {
1494 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
1495 .element_tile = 1,
1496 };
1497 xnn_params.f32.rndd = (struct vunary_parameters) {
1498 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
1499 .element_tile = 1,
1500 };
Marat Dukhance834ad2022-01-03 00:22:01 -08001501 xnn_params.f32.sigmoid = (struct vunary_parameters) {
1502 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
1503 .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
1504 .element_tile = 2,
1505 };
Marat Dukhane5efb162021-12-31 10:26:13 -08001506 xnn_params.f32.sqr = (struct vunary_parameters) {
1507 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
1508 .element_tile = 4,
1509 };
Marat Dukhane72b2822021-12-30 14:46:58 -08001510 xnn_params.f32.sqrt = (struct vunary_parameters) {
1511 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
1512 .element_tile = 1,
1513 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001514 xnn_params.f32.prelu = (struct prelu_parameters) {
1515 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
1516 .row_tile = 4,
1517 .channel_tile = 4,
1518 };
Marat Dukhan4a5c7712022-01-05 22:43:13 -08001519 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
1520 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
1521 .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
1522 .element_tile = 4,
1523 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001524 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
1525 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001526 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
1527 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
1528 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001529 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001530 .element_tile = 8,
1531 };
1532 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07001533 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
1534 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
1535 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
Marat Dukhanf6004972021-12-30 11:23:02 -08001536 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001537 .element_tile = 2,
1538 };
1539 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001540 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
1541 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
1542 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001543 .element_tile = 8,
1544 };
1545 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001546 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
1547 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
1548 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001549 .element_tile = 8,
1550 };
1551 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001552 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
1553 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
1554 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001555 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001556 .element_tile = 8,
1557 };
1558 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001559 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
1560 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
1561 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08001562 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001563 .element_tile = 8,
1564 };
Marat Dukhanf7399262020-06-05 10:58:44 -07001565 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07001566 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
1567 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
1568 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07001569 .element_tile = 8,
1570 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001571 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07001572 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07001573 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001574 .channel_tile = 1,
1575 .row_tile = 2,
1576 };
1577 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08001578 init_flags |= XNN_INIT_FLAG_CHW_OPT;
1579
Marat Dukhan3b745a42020-05-10 21:43:25 -07001580 xnn_params.f32.spmm = (struct spmm_parameters) {
1581 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
1582 .mr = 8,
1583 .nr = 1,
1584 };
1585 xnn_params.f32.spmm2 = (struct spmm_parameters) {
1586 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
1587 .mr = 8,
1588 .nr = 2,
1589 };
1590 xnn_params.f32.spmm4 = (struct spmm_parameters) {
1591 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
1592 .mr = 8,
1593 .nr = 4,
1594 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07001595 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
Marat Dukhan3b745a42020-05-10 21:43:25 -07001596 .ukernel_with_symm_padding =
Marat Dukhan1f29b802020-05-15 23:46:39 -07001597 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001598 .output_channel_tile = 4,
1599 .output_height_tile = 1,
1600 .output_width_tile = 1,
1601 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07001602 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001603 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1,
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001604 .output_width_tile = 1,
1605 .output_height_tile = 4,
1606 };
1607 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
1608 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001609 .output_width_tile = 1,
Marat Dukhan91249d22020-10-24 12:02:51 -07001610 .output_height_tile = 2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001611 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07001612 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001613 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001614 .output_width_tile = 1,
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001615 .output_height_tile = 2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001616 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07001617 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001618 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001619 .output_width_tile = 1,
Marat Dukhan4ddfab42020-12-07 17:33:11 -08001620 .output_height_tile = 2,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001621 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07001622 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
1623 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
Marat Dukhan3b745a42020-05-10 21:43:25 -07001624 .channel_tile = 1,
1625 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07001626 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
1627 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
1628 .channel_tile = 1,
1629 .pixel_tile = 4,
1630 };
Marat Dukhan3b745a42020-05-10 21:43:25 -07001631 #endif // XNN_NO_NCHW_OPERATORS
1632 #endif // XNN_NO_F32_OPERATORS
1633
Frank Barchardb40ee632021-12-30 11:10:02 -08001634 /*************************** VCVT AArch32 Pre-NEON micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001635 #ifndef XNN_NO_VCVT_OPERATORS
1636 init_flags |= XNN_INIT_FLAG_VCVT;
1637
Marat Dukhan134f9842021-12-29 19:57:31 -08001638 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1639 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x4,
1640 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
1641 .element_tile = 4,
1642 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08001643 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1644 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
1645 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
1646 .element_tile = 2,
1647 };
Marat Dukhaned2d7762021-12-03 23:51:19 -08001648 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08001649 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_imagic_x4,
1650 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
Marat Dukhaned2d7762021-12-03 23:51:19 -08001651 .element_tile = 4,
1652 };
1653 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08001654 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x4,
1655 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
Marat Dukhaned2d7762021-12-03 23:51:19 -08001656 .element_tile = 4,
1657 };
Marat Dukhanf92206b2021-12-10 17:02:07 -08001658 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
1659 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
1660 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
1661 .element_tile = 4,
1662 };
1663 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
1664 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
1665 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
1666 .element_tile = 4,
1667 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07001668 #endif // XNN_NO_VCVT_OPERATORS
1669
Frank Barchardb40ee632021-12-30 11:10:02 -08001670 /**************************** X32 AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan3b745a42020-05-10 21:43:25 -07001671 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07001672 init_flags |= XNN_INIT_FLAG_X32;
1673
Marat Dukhan3b745a42020-05-10 21:43:25 -07001674 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
1675 xnn_params.x32.zip = (struct zip_parameters) {
1676 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
1677 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
1678 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
1679 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
1680 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08001681 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08001682 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
1683 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08001684 .channel_tile = 1,
1685 .pixel_tile = 1,
1686 };
1687 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -07001688 #endif // XNN_NO_X32_OPERATORS
Marat Dukhan933051b2021-08-07 16:26:15 -07001689
Frank Barchardb40ee632021-12-30 11:10:02 -08001690 /**************************** XX AArch32 Pre-NEON micro-kernels ****************************/
Marat Dukhan933051b2021-08-07 16:26:15 -07001691 #ifndef XNN_NO_XX_OPERATORS
1692 init_flags |= XNN_INIT_FLAG_XX;
1693
1694 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
1695 xnn_params.xx.fill = (struct fill_parameters) {
1696 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
1697 .row_tile = 1,
1698 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07001699 xnn_params.xx.pad = (struct pad_parameters) {
1700 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
1701 .row_tile = 1,
1702 };
Marat Dukhan933051b2021-08-07 16:26:15 -07001703 #endif // XNN_NO_XX_OPERATORS
Marat Dukhan3b745a42020-05-10 21:43:25 -07001704 }
XNNPACK Teamb455b122019-09-27 18:10:33 -07001705
Marat Dukhan1dadbf72019-10-01 10:46:20 -07001706#elif XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -07001707
Frank Barchardb40ee632021-12-30 11:10:02 -08001708 /**************************** QC8 AArch64 micro-kernels ****************************/
Marat Dukhan898d5852021-06-30 21:18:34 -07001709 #ifndef XNN_NO_QC8_OPERATORS
1710 init_flags |= XNN_INIT_FLAG_QC8;
1711
Marat Dukhan75d1b792021-07-01 13:00:28 -07001712 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1713 #if XNN_ENABLE_ASSEMBLY
1714 if (cpuinfo_has_arm_neon_dot()) {
1715 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1716 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1717 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1718 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001719 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001720 xnn_params.qc8.gemm.mr = 4;
1721 xnn_params.qc8.gemm.nr = 16;
1722 xnn_params.qc8.gemm.log2_kr = 2;
1723 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001724 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1725 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1726 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1727 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001728 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001729 xnn_params.qc8.gemm.mr = 2;
1730 xnn_params.qc8.gemm.nr = 8;
1731 xnn_params.qc8.gemm.log2_kr = 3;
1732 }
1733 #else // !XNN_ENABLE_ASSEMBLY
1734 if (cpuinfo_has_arm_neon_dot()) {
1735 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
1736 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1737 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
1738 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001739 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001740 xnn_params.qc8.gemm.mr = 4;
1741 xnn_params.qc8.gemm.nr = 16;
1742 xnn_params.qc8.gemm.log2_kr = 2;
1743 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001744 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1745 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1746 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1747 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001748 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001749 xnn_params.qc8.gemm.mr = 2;
1750 xnn_params.qc8.gemm.nr = 8;
1751 xnn_params.qc8.gemm.log2_kr = 1;
Frank Barcharde7043ff2021-11-10 14:50:08 -08001752 xnn_params.qc8.gemm.log2_sr = 2;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001753 }
1754 #endif // XNN_ENABLE_ASSEMBLY
1755 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1756 #if XNN_ENABLE_ASSEMBLY
1757 if (cpuinfo_has_arm_neon_dot()) {
1758 switch (cpuinfo_get_core(0)->uarch) {
1759 case cpuinfo_uarch_cortex_a55:
1760 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1761 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1762 break;
1763 case cpuinfo_uarch_cortex_x1:
1764 case cpuinfo_uarch_cortex_a78:
1765 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1766 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1767 break;
1768 default:
1769 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
1770 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
1771 break;
1772 }
1773 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1774 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001775 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001776 xnn_params.qc8.gemm.mr = 4;
1777 xnn_params.qc8.gemm.nr = 16;
1778 xnn_params.qc8.gemm.log2_kr = 2;
1779 } else {
1780 switch (cpuinfo_get_core(0)->uarch) {
Frank Barchard6c34dbf2021-11-22 16:14:53 -08001781 case cpuinfo_uarch_cortex_a35:
1782 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1783 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1784 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1785 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
Marat Dukhan7988a182021-12-06 22:00:33 -08001786 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard6c34dbf2021-11-22 16:14:53 -08001787 xnn_params.qc8.gemm.mr = 4;
1788 xnn_params.qc8.gemm.nr = 16;
1789 break;
1790
Marat Dukhan75d1b792021-07-01 13:00:28 -07001791 case cpuinfo_uarch_cortex_a53:
1792 case cpuinfo_uarch_cortex_a55r0:
1793 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1794 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1795 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1796 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
Marat Dukhan7988a182021-12-06 22:00:33 -08001797 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001798 xnn_params.qc8.gemm.mr = 4;
1799 xnn_params.qc8.gemm.nr = 16;
1800 break;
1801
1802 case cpuinfo_uarch_cortex_a72:
1803 case cpuinfo_uarch_cortex_a73:
1804 case cpuinfo_uarch_kryo:
Frank Barcharde22685a2021-11-12 11:36:58 -08001805 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1806 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1807 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
1808 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
Marat Dukhan7988a182021-12-06 22:00:33 -08001809 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001810 xnn_params.qc8.gemm.mr = 2;
1811 xnn_params.qc8.gemm.nr = 8;
1812 xnn_params.qc8.gemm.log2_kr = 3;
1813 break;
1814
1815 default:
Frank Barcharde22685a2021-11-12 11:36:58 -08001816 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1817 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1818 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1819 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001820 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001821 xnn_params.qc8.gemm.mr = 2;
1822 xnn_params.qc8.gemm.nr = 8;
1823 xnn_params.qc8.gemm.log2_kr = 3;
1824 break;
1825 }
1826 }
1827 #if XNN_MAX_UARCH_TYPES > 1
1828 {
1829 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1830 const uint32_t mr = xnn_params.qc8.gemm.mr;
1831 const uint32_t nr = xnn_params.qc8.gemm.nr;
1832 const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
1833 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1834 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
1835 if (uarch_info == NULL) {
1836 /* No more microarchitectures in the system */
1837 break;
1838 }
1839
1840 switch (uarch_info->uarch) {
1841 case cpuinfo_uarch_cortex_a53:
1842 case cpuinfo_uarch_cortex_a55r0:
1843 if (mr == 2 && nr == 8 && log2_kr == 3) {
Frank Barcharde22685a2021-11-12 11:36:58 -08001844 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1845 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1846 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1847 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001848 }
1849 break;
1850
1851 case cpuinfo_uarch_cortex_a55:
Frank Barchardc37b8da2021-09-01 00:35:19 -07001852 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
Marat Dukhan75d1b792021-07-01 13:00:28 -07001853 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1854 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1855 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot;
1856 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot;
1857 }
1858 break;
1859 default:
1860 break;
1861 }
1862 }
1863 }
1864 #endif // XNN_MAX_UARCH_TYPES > 1
1865 #else // !XNN_ENABLE_ASSEMBLY
1866 if (cpuinfo_has_arm_neon_dot()) {
1867 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
1868 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1869 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
1870 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
Marat Dukhan7988a182021-12-06 22:00:33 -08001871 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001872 xnn_params.qc8.gemm.mr = 4;
1873 xnn_params.qc8.gemm.nr = 16;
1874 xnn_params.qc8.gemm.log2_kr = 2;
1875 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001876 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1877 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1878 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1879 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
Marat Dukhan7988a182021-12-06 22:00:33 -08001880 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001881 xnn_params.qc8.gemm.mr = 2;
1882 xnn_params.qc8.gemm.nr = 8;
1883 xnn_params.qc8.gemm.log2_kr = 1;
Frank Barcharde7043ff2021-11-10 14:50:08 -08001884 xnn_params.qc8.gemm.log2_sr = 2;
Marat Dukhan75d1b792021-07-01 13:00:28 -07001885 }
1886 #endif // XNN_ENABLE_ASSEMBLY
1887 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhan898d5852021-06-30 21:18:34 -07001888
Frank Barchard0d065732021-08-31 00:01:40 -07001889 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -08001890 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard0d065732021-08-31 00:01:40 -07001891 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07001892 xnn_params.qc8.dwconv[0].primary_tile = 9;
Frank Barchard7da8b022021-08-31 09:49:10 -07001893 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__neonv8_mla8_ld64;
Marat Dukhan7988a182021-12-06 22:00:33 -08001894 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
Frank Barchard7da8b022021-08-31 09:49:10 -07001895 xnn_params.qc8.dwconv[1].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07001896 xnn_params.qc8.dwconv[1].primary_tile = 25;
1897 #endif // XNN_NO_QC8_OPERATORS
1898
Frank Barchardb40ee632021-12-30 11:10:02 -08001899 /**************************** QS8 AArch64 micro-kernels ****************************/
Marat Dukhanf28cddf2020-08-10 14:05:02 -07001900 #ifndef XNN_NO_QS8_OPERATORS
1901 init_flags |= XNN_INIT_FLAG_QS8;
1902
Marat Dukhandfe47b92020-12-14 02:48:43 -08001903 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Frank Barchardbc0c7292020-10-06 13:36:54 -07001904 #if XNN_ENABLE_ASSEMBLY
Marat Dukhan31677ad2020-10-13 23:59:31 -07001905 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001906 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1907 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1908 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1909 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1910 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001911 xnn_params.qs8.gemm.mr = 4;
1912 xnn_params.qs8.gemm.nr = 16;
1913 xnn_params.qs8.gemm.log2_kr = 2;
1914 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001915 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1916 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1917 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
1918 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001919 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001920 xnn_params.qs8.gemm.mr = 2;
Frank Barcharda414daa2021-02-23 15:50:39 -08001921 xnn_params.qs8.gemm.nr = 8;
Frank Barchardbbf51822021-03-12 10:37:31 -08001922 xnn_params.qs8.gemm.log2_kr = 3;
Frank Barchard1e8590e2020-10-12 21:20:46 -07001923 }
Marat Dukhan31677ad2020-10-13 23:59:31 -07001924 #else // !XNN_ENABLE_ASSEMBLY
1925 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001926 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
1927 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1928 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
1929 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1930 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001931 xnn_params.qs8.gemm.mr = 4;
1932 xnn_params.qs8.gemm.nr = 16;
1933 xnn_params.qs8.gemm.log2_kr = 2;
1934 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08001935 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1936 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1937 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
1938 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001939 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001940 xnn_params.qs8.gemm.mr = 2;
Frank Barcharda414daa2021-02-23 15:50:39 -08001941 xnn_params.qs8.gemm.nr = 8;
1942 xnn_params.qs8.gemm.log2_kr = 1;
Frank Barchard66ae2572021-11-02 17:36:21 -07001943 xnn_params.qs8.gemm.log2_sr = 2;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001944 }
1945 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08001946 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
Marat Dukhan31677ad2020-10-13 23:59:31 -07001947 #if XNN_ENABLE_ASSEMBLY
1948 if (cpuinfo_has_arm_neon_dot()) {
1949 switch (cpuinfo_get_core(0)->uarch) {
1950 case cpuinfo_uarch_cortex_a55:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001951 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1952 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
Marat Dukhan31677ad2020-10-13 23:59:31 -07001953 break;
Frank Barchard0ae35f22021-06-15 17:34:24 -07001954 case cpuinfo_uarch_cortex_x1:
1955 case cpuinfo_uarch_cortex_a78:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001956 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1957 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
Frank Barchard0ae35f22021-06-15 17:34:24 -07001958 break;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001959 default:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001960 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
1961 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
Marat Dukhan31677ad2020-10-13 23:59:31 -07001962 break;
1963 }
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001964 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1965 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1966 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07001967 xnn_params.qs8.gemm.mr = 4;
1968 xnn_params.qs8.gemm.nr = 16;
1969 xnn_params.qs8.gemm.log2_kr = 2;
1970 } else {
Frank Barchard2a995e72021-04-13 16:24:25 -07001971 switch (cpuinfo_get_core(0)->uarch) {
Frank Barchard6c34dbf2021-11-22 16:14:53 -08001972 case cpuinfo_uarch_cortex_a35:
1973 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1974 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1975 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1976 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1977 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1978 xnn_params.qs8.gemm.mr = 4;
1979 xnn_params.qs8.gemm.nr = 16;
1980 break;
1981
Frank Barchard2a995e72021-04-13 16:24:25 -07001982 case cpuinfo_uarch_cortex_a53:
Frank Barchardfb5983d2021-04-20 14:09:08 -07001983 case cpuinfo_uarch_cortex_a55r0:
Frank Barchard22f9a9f2021-07-21 11:35:27 -07001984 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1985 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1986 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1987 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1988 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchardd4416d62021-05-17 15:51:37 -07001989 xnn_params.qs8.gemm.mr = 4;
1990 xnn_params.qs8.gemm.nr = 16;
Frank Barchard6ac1d182021-04-14 13:47:07 -07001991 break;
1992
Frank Barchard2a995e72021-04-13 16:24:25 -07001993 case cpuinfo_uarch_cortex_a72:
1994 case cpuinfo_uarch_cortex_a73:
1995 case cpuinfo_uarch_kryo:
Frank Barcharde22685a2021-11-12 11:36:58 -08001996 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1997 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1998 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
1999 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07002000 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard2a995e72021-04-13 16:24:25 -07002001 xnn_params.qs8.gemm.mr = 2;
2002 xnn_params.qs8.gemm.nr = 8;
2003 xnn_params.qs8.gemm.log2_kr = 3;
Frank Barchardc77fc4c2021-04-14 13:28:01 -07002004 break;
Frank Barchard2a995e72021-04-13 16:24:25 -07002005
2006 default:
Frank Barcharde22685a2021-11-12 11:36:58 -08002007 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
2008 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
2009 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
2010 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07002011 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard2a995e72021-04-13 16:24:25 -07002012 xnn_params.qs8.gemm.mr = 2;
2013 xnn_params.qs8.gemm.nr = 8;
2014 xnn_params.qs8.gemm.log2_kr = 3;
Frank Barchardc77fc4c2021-04-14 13:28:01 -07002015 break;
Frank Barchard2a995e72021-04-13 16:24:25 -07002016 }
Marat Dukhan31677ad2020-10-13 23:59:31 -07002017 }
2018 #if XNN_MAX_UARCH_TYPES > 1
2019 {
2020 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2021 const uint32_t mr = xnn_params.qs8.gemm.mr;
2022 const uint32_t nr = xnn_params.qs8.gemm.nr;
2023 const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
2024 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2025 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2026 if (uarch_info == NULL) {
2027 /* No more microarchitectures in the system */
2028 break;
2029 }
2030
2031 switch (uarch_info->uarch) {
Frank Barchard2a995e72021-04-13 16:24:25 -07002032 case cpuinfo_uarch_cortex_a53:
Frank Barchard90f520b2021-04-26 18:01:51 -07002033 case cpuinfo_uarch_cortex_a55r0:
Frank Barchard2a995e72021-04-13 16:24:25 -07002034 if (mr == 2 && nr == 8 && log2_kr == 3) {
Frank Barcharde22685a2021-11-12 11:36:58 -08002035 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
2036 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
2037 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
2038 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
Frank Barchard2a995e72021-04-13 16:24:25 -07002039 }
2040 break;
2041
Marat Dukhan31677ad2020-10-13 23:59:31 -07002042 case cpuinfo_uarch_cortex_a55:
Frank Barchardc37b8da2021-09-01 00:35:19 -07002043 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07002044 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2045 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2046 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot;
2047 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot;
Marat Dukhan31677ad2020-10-13 23:59:31 -07002048 }
2049 break;
2050 default:
2051 break;
2052 }
2053 }
2054 }
2055 #endif // XNN_MAX_UARCH_TYPES > 1
2056 #else // !XNN_ENABLE_ASSEMBLY
2057 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard22f9a9f2021-07-21 11:35:27 -07002058 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
2059 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2060 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
2061 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2062 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07002063 xnn_params.qs8.gemm.mr = 4;
2064 xnn_params.qs8.gemm.nr = 16;
2065 xnn_params.qs8.gemm.log2_kr = 2;
2066 } else {
Frank Barcharde22685a2021-11-12 11:36:58 -08002067 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2068 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2069 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
2070 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
Frank Barchard22f9a9f2021-07-21 11:35:27 -07002071 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Marat Dukhan31677ad2020-10-13 23:59:31 -07002072 xnn_params.qs8.gemm.mr = 2;
Frank Barcharda414daa2021-02-23 15:50:39 -08002073 xnn_params.qs8.gemm.nr = 8;
2074 xnn_params.qs8.gemm.log2_kr = 1;
Frank Barcharde7043ff2021-11-10 14:50:08 -08002075 xnn_params.qs8.gemm.log2_sr = 2;
Marat Dukhan31677ad2020-10-13 23:59:31 -07002076 }
2077 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08002078 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhanf28cddf2020-08-10 14:05:02 -07002079
Frank Barchard0d065732021-08-31 00:01:40 -07002080 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
Marat Dukhan4ba70b72021-07-19 11:20:16 -07002081 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard0d065732021-08-31 00:01:40 -07002082 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhanf28cddf2020-08-10 14:05:02 -07002083 xnn_params.qs8.dwconv[0].primary_tile = 9;
Frank Barchard7da8b022021-08-31 09:49:10 -07002084 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld64;
Marat Dukhan4ba70b72021-07-19 11:20:16 -07002085 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
Frank Barchard7da8b022021-08-31 09:49:10 -07002086 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhan4ed14882021-05-12 17:50:40 -07002087 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhanf28cddf2020-08-10 14:05:02 -07002088
2089 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan85755042022-01-13 01:46:05 -08002090 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
2091 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
2092 .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
2093 .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08002094 .row_tile = 7,
2095 .channel_tile = 8,
Marat Dukhanf28cddf2020-08-10 14:05:02 -07002096 };
Marat Dukhanff209482020-09-03 14:26:53 -07002097
2098 xnn_params.qs8.vadd = (struct vbinary_parameters) {
Marat Dukhan01debd92021-07-29 18:14:21 -07002099 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x32,
2100 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
2101 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07002102 .init.qs8_addsub = xnn_init_qs8_add_minmax_neon_params,
Marat Dukhan01debd92021-07-29 18:14:21 -07002103 .element_tile = 32,
Marat Dukhanff209482020-09-03 14:26:53 -07002104 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07002105 xnn_params.qs8.vmul = (struct vbinary_parameters) {
Marat Dukhan33a98fa2022-01-13 00:08:57 -08002106 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
2107 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2108 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2109 .init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params,
Marat Dukhan0853b8a2021-08-03 01:01:53 -07002110 .element_tile = 16,
2111 };
Marat Dukhanf28cddf2020-08-10 14:05:02 -07002112 #endif // XNN_NO_QS8_OPERATORS
2113
Frank Barchardb40ee632021-12-30 11:10:02 -08002114 /**************************** QU8 AArch64 micro-kernels ****************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -07002115 #ifndef XNN_NO_QU8_OPERATORS
2116 init_flags |= XNN_INIT_FLAG_QU8;
Frank Barchard20255152021-08-11 14:01:45 -07002117
Frank Barcharda962f1e2021-08-02 13:52:15 -07002118 #if XNN_ENABLE_ASSEMBLY
Frank Barchard20255152021-08-11 14:01:45 -07002119 if (cpuinfo_has_arm_neon_dot()) {
Frank Barchard8b698022021-08-26 11:17:32 -07002120 switch (cpuinfo_get_core(0)->uarch) {
2121 case cpuinfo_uarch_cortex_a55:
Frank Barcharda49e41f2021-08-31 20:30:24 -07002122 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
2123 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
2124 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2125 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
Frank Barchard8b698022021-08-26 11:17:32 -07002126 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2127 xnn_params.qu8.gemm.mr = 4;
Frank Barcharda49e41f2021-08-31 20:30:24 -07002128 xnn_params.qu8.gemm.nr = 16;
Frank Barchard8b698022021-08-26 11:17:32 -07002129 xnn_params.qu8.gemm.log2_kr = 2;
2130 break;
2131 default:
2132 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
2133 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
2134 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2135 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2136 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2137 xnn_params.qu8.gemm.mr = 4;
2138 xnn_params.qu8.gemm.nr = 16;
2139 xnn_params.qu8.gemm.log2_kr = 2;
2140 break;
2141 }
Frank Barchard20255152021-08-11 14:01:45 -07002142 } else {
2143 switch (cpuinfo_get_core(0)->uarch) {
2144 case cpuinfo_uarch_cortex_a53:
2145 case cpuinfo_uarch_cortex_a55r0:
Frank Barchard20255152021-08-11 14:01:45 -07002146 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
2147 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
2148 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2149 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2150 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2151 xnn_params.qu8.gemm.mr = 4;
2152 xnn_params.qu8.gemm.nr = 16;
2153 break;
Frank Barchardf479a1c2021-08-03 10:20:30 -07002154
Frank Barchard20255152021-08-11 14:01:45 -07002155 case cpuinfo_uarch_cortex_a57:
2156 case cpuinfo_uarch_cortex_a72:
2157 case cpuinfo_uarch_cortex_a73:
2158 case cpuinfo_uarch_cortex_a75:
2159 case cpuinfo_uarch_cortex_a76:
2160 case cpuinfo_uarch_exynos_m1:
2161 case cpuinfo_uarch_exynos_m2:
2162 case cpuinfo_uarch_exynos_m3:
2163 case cpuinfo_uarch_exynos_m4:
2164 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
2165 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
2166 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2167 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2168 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2169 xnn_params.qu8.gemm.mr = 4;
2170 xnn_params.qu8.gemm.nr = 16;
2171 break;
Frank Barchardf479a1c2021-08-03 10:20:30 -07002172
Frank Barchard20255152021-08-11 14:01:45 -07002173 case cpuinfo_uarch_kryo:
2174 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2175 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2176 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2177 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2178 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2179 xnn_params.qu8.gemm.mr = 4;
2180 xnn_params.qu8.gemm.nr = 16;
2181 break;
2182
2183 default:
2184 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
2185 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
2186 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2187 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2188 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2189 xnn_params.qu8.gemm.mr = 4;
2190 xnn_params.qu8.gemm.nr = 16;
2191 break;
2192 }
Frank Barchardf479a1c2021-08-03 10:20:30 -07002193 }
Frank Barchardc37b8da2021-09-01 00:35:19 -07002194 #if XNN_MAX_UARCH_TYPES > 1
2195 {
2196 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2197 const uint32_t mr = xnn_params.qu8.gemm.mr;
2198 const uint32_t nr = xnn_params.qu8.gemm.nr;
2199 const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
2200 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2201 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2202 if (uarch_info == NULL) {
2203 /* No more microarchitectures in the system */
2204 break;
2205 }
2206
2207 switch (uarch_info->uarch) {
2208 case cpuinfo_uarch_cortex_a53:
2209 case cpuinfo_uarch_cortex_a55r0:
2210 if (mr == 4 && nr == 16 && log2_kr == 0) {
2211 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
2212 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
2213 }
2214 break;
2215
2216 case cpuinfo_uarch_cortex_a55:
2217 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
2218 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2219 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2220 }
2221 break;
2222 default:
2223 break;
2224 }
2225 }
2226 }
2227 #endif // XNN_MAX_UARCH_TYPES > 1
Frank Barchard20255152021-08-11 14:01:45 -07002228 #else // !XNN_ENABLE_ASSEMBLY
2229 if (cpuinfo_has_arm_neon_dot()) {
2230 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
2231 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
2232 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2233 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2234 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2235 xnn_params.qu8.gemm.mr = 4;
2236 xnn_params.qu8.gemm.nr = 16;
2237 xnn_params.qu8.gemm.log2_kr = 2;
2238 } else {
2239 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2240 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2241 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2242 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2243 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2244 xnn_params.qu8.gemm.mr = 4;
2245 xnn_params.qu8.gemm.nr = 16;
Marat Dukhan947805b2021-12-07 14:32:09 -08002246 }
Frank Barchard20255152021-08-11 14:01:45 -07002247 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhanaefaef32020-04-09 07:09:34 -07002248
Frank Barchard354cbc62021-09-27 21:42:41 -07002249 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -07002250 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barchard354cbc62021-09-27 21:42:41 -07002251 xnn_params.qu8.dwconv[0].channel_tile = 16;
Marat Dukhan08b7a972020-07-14 18:17:29 -07002252 xnn_params.qu8.dwconv[0].primary_tile = 9;
Frank Barchard354cbc62021-09-27 21:42:41 -07002253 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
Marat Dukhan73a899a2021-07-27 00:10:38 -07002254 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
Frank Barchard354cbc62021-09-27 21:42:41 -07002255 xnn_params.qu8.dwconv[1].channel_tile = 8;
Marat Dukhan81721352021-07-15 18:26:08 -07002256 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002257
Marat Dukhan08b7a972020-07-14 18:17:29 -07002258 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08002259 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
2260 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
Marat Dukhan3c949a32022-01-09 20:12:33 -08002261 .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08002262 .primary_tile = 9,
2263 .incremental_tile = 8,
2264 .channel_tile = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002265 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07002266 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan85755042022-01-13 01:46:05 -08002267 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
2268 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
2269 .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
2270 .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08002271 .row_tile = 7,
2272 .channel_tile = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002273 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07002274 xnn_params.qu8.vadd = (struct vbinary_parameters) {
Frank Barchard0a3093c2021-08-31 09:58:11 -07002275 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x32,
2276 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
2277 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07002278 .init.qu8_addsub = xnn_init_qu8_add_minmax_neon_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -07002279 .element_tile = 8,
2280 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07002281 xnn_params.qu8.vmul = (struct vbinary_parameters) {
Marat Dukhan33a98fa2022-01-13 00:08:57 -08002282 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
2283 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2284 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2285 .init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params,
Marat Dukhan0853b8a2021-08-03 01:01:53 -07002286 .element_tile = 16,
2287 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07002288 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002289
Frank Barchardb40ee632021-12-30 11:10:02 -08002290 /**************************** S8 AArch64 micro-kernels ****************************/
Marat Dukhan23147532021-08-16 07:26:56 -07002291 #ifndef XNN_NO_S8_OPERATORS
2292 init_flags |= XNN_INIT_FLAG_S8;
2293
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07002294 xnn_params.s8.clamp = (struct vunary_parameters) {
2295 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
2296 .init.s8_minmax = xnn_init_s8_minmax_neon_params,
2297 .element_tile = 64,
2298 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08002299 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
2300 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c16,
2301 .pixel_tile = 1,
2302 .channel_tile = 16,
2303 };
Marat Dukhan23147532021-08-16 07:26:56 -07002304 xnn_params.s8.maxpool = (struct maxpool_parameters) {
2305 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhandc5c1482021-08-16 09:03:15 -07002306 .init.s8 = xnn_init_s8_minmax_neon_params,
Marat Dukhan23147532021-08-16 07:26:56 -07002307 .mr = 9,
2308 .qr = 8,
2309 };
2310 #endif // XNN_NO_S8_OPERATORS
2311
Frank Barchardb40ee632021-12-30 11:10:02 -08002312 /**************************** U8 AArch64 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002313 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002314 init_flags |= XNN_INIT_FLAG_U8;
2315
Marat Dukhan94912792021-08-16 21:40:30 -07002316 xnn_params.u8.clamp = (struct vunary_parameters) {
2317 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
2318 .init.u8_minmax = xnn_init_u8_minmax_neon_params,
2319 .element_tile = 64,
2320 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08002321 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
2322 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c16,
2323 .pixel_tile = 1,
2324 .channel_tile = 16,
2325 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002326 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07002327 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
Marat Dukhan2ea50a02021-08-16 12:59:19 -07002328 .init.u8 = xnn_init_u8_minmax_neon_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002329 .mr = 9,
2330 .qr = 8,
2331 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002332 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
2333 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
2334 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002335
Frank Barchardb40ee632021-12-30 11:10:02 -08002336 /**************************** X8 AArch64 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002337 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002338 init_flags |= XNN_INIT_FLAG_X8;
2339
Marat Dukhan98e054b2021-09-13 09:43:50 -07002340 xnn_params.x8.lut = xnn_x8_lut_ukernel__neon_tbx128x4_x64;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002341 xnn_params.x8.zip = (struct zip_parameters) {
2342 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
2343 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
2344 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
2345 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
2346 };
2347 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002348
Frank Barchardb40ee632021-12-30 11:10:02 -08002349 /**************************** F16 AArch64 micro-kernels ****************************/
Frank Barchard7e2cbb02020-06-12 01:22:13 -07002350 #ifndef XNN_NO_F16_OPERATORS
Marat Dukhan8d5d2592020-06-19 12:48:57 -07002351 if (cpuinfo_has_arm_neon_fp16_arith()) {
2352 init_flags |= XNN_INIT_FLAG_F16;
Frank Barchard7c3826e2021-06-07 15:14:16 -07002353 xnn_params.f16.gemm.mr = 6;
2354 xnn_params.f16.gemm.nr = 16;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002355
Frank Barchard6b73c4f2020-06-26 18:40:40 -07002356 #if XNN_ENABLE_ASSEMBLY
Frank Barchard7c3826e2021-06-07 15:14:16 -07002357 switch (cpuinfo_get_core(0)->uarch) {
2358 case cpuinfo_uarch_cortex_a55:
2359 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55);
2360 break;
2361
Frank Barchard07f4a892021-06-07 18:26:08 -07002362 case cpuinfo_uarch_cortex_a75:
Frank Barchard7b48ddc2021-06-11 13:00:49 -07002363 case cpuinfo_uarch_cortex_x1:
Frank Barchard07f4a892021-06-07 18:26:08 -07002364 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75);
2365 break;
2366
Frank Barchard7c3826e2021-06-07 15:14:16 -07002367 default:
2368 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
2369 break;
2370 }
Frank Barchard6b73c4f2020-06-26 18:40:40 -07002371 xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
Frank Barchard7c3826e2021-06-07 15:14:16 -07002372
2373 #if XNN_MAX_UARCH_TYPES > 1
2374 {
2375 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2376 const uint32_t mr = xnn_params.f16.gemm.mr;
2377 const uint32_t nr = xnn_params.f16.gemm.nr;
2378 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2379 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2380 if (uarch_info == NULL) {
2381 /* No more microarchitectures in the system */
2382 break;
2383 }
2384
2385 switch (uarch_info->uarch) {
2386 case cpuinfo_uarch_cortex_a55:
2387 if (mr == 6 && nr == 16) {
2388 xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55;
2389 }
2390 break;
Frank Barchard07f4a892021-06-07 18:26:08 -07002391
Frank Barchardd2f454e2021-06-08 10:47:16 -07002392 case cpuinfo_uarch_cortex_a55r0:
2393 if (mr == 6 && nr == 16) {
2394 xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64;
2395 }
2396 break;
2397
Frank Barchard07f4a892021-06-07 18:26:08 -07002398 /* Cortex A75 is the medium core in Exynos 9820 (M4) */
2399 case cpuinfo_uarch_cortex_a75:
2400 if (mr == 6 && nr == 16) {
2401 xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75;
2402 }
2403 break;
2404
Frank Barchard7c3826e2021-06-07 15:14:16 -07002405 default:
2406 break;
2407 }
2408 }
2409 }
2410 #endif // XNN_MAX_UARCH_TYPES > 1
2411 #else // !XNN_ENABLE_ASSEMBLY
Frank Barchard6b73c4f2020-06-26 18:40:40 -07002412 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2413 xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
Frank Barchard7c3826e2021-06-07 15:14:16 -07002414 #endif // XNN_ENABLE_ASSEMBLY
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002415 xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002416 xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
Marat Dukhanc4302c22022-01-06 19:27:03 -08002417 xnn_params.f16.gemm.init.f16 = xnn_init_f16_scaleminmax_neon_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002418
2419 xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith;
Marat Dukhan645af972022-01-09 22:50:27 -08002420 xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_neon_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002421 xnn_params.f16.dwconv[0].channel_tile = 16;
2422 xnn_params.f16.dwconv[0].primary_tile = 4;
2423
2424 xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith;
Marat Dukhan645af972022-01-09 22:50:27 -08002425 xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_neon_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002426 xnn_params.f16.dwconv[1].channel_tile = 16;
2427 xnn_params.f16.dwconv[1].primary_tile = 9;
2428
2429 xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2;
Marat Dukhan645af972022-01-09 22:50:27 -08002430 xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_neon_params;
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002431 xnn_params.f16.dwconv[2].channel_tile = 8;
2432 xnn_params.f16.dwconv[2].primary_tile = 25;
2433
Marat Dukhan8d5d2592020-06-19 12:48:57 -07002434 xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08002435 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8,
2436 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8,
2437 .init.f16 = xnn_init_f16_scaleminmax_neon_params,
2438 .update.f16 = xnn_update_f16_scaleminmax_neon_params,
2439 .row_tile = 7,
2440 .channel_tile = 8,
Marat Dukhan8d5d2592020-06-19 12:48:57 -07002441 };
Marat Dukhan0a756b52022-02-03 23:08:50 -08002442
Marat Dukhan5756a922022-02-04 01:55:53 -08002443 xnn_params.f16.maxpool = (struct maxpool_parameters) {
2444 .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__neonfp16arith_c8,
2445 .init.f16 = xnn_init_f16_minmax_neon_params,
2446 .mr = 9,
2447 .qr = 8,
2448 };
2449
Marat Dukhan0a756b52022-02-03 23:08:50 -08002450 xnn_params.f16.prelu = (struct prelu_parameters) {
2451 .ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__neonfp16arith_2x16,
2452 .row_tile = 2,
2453 .channel_tile = 16,
2454 };
2455
Frank Barchard01898c02020-06-23 21:49:50 -07002456 xnn_params.f16.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002457 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__neonfp16arith_x16,
2458 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
2459 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
Marat Dukhan645af972022-01-09 22:50:27 -08002460 .init.f16_minmax = xnn_init_f16_minmax_neon_params,
Frank Barchard01898c02020-06-23 21:49:50 -07002461 .element_tile = 16,
2462 };
Frank Barchard0ea6a772020-09-09 15:26:31 -07002463 xnn_params.f16.vmul = (struct vbinary_parameters) {
2464 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__neonfp16arith_x16,
2465 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
2466 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
Marat Dukhan645af972022-01-09 22:50:27 -08002467 .init.f16_minmax = xnn_init_f16_minmax_neon_params,
Frank Barchard0ea6a772020-09-09 15:26:31 -07002468 .element_tile = 16,
2469 };
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002470 xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07002471 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__neonfp16arith_2x,
Marat Dukhan645af972022-01-09 22:50:27 -08002472 .init.f16 = xnn_init_f16_minmax_neon_params,
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002473 .channel_tile = 8,
2474 .row_tile = 2,
2475 };
Marat Dukhan0a756b52022-02-03 23:08:50 -08002476
Marat Dukhan561d0682021-12-23 16:12:35 -08002477 xnn_params.f16.hswish = (struct vunary_parameters) {
2478 .ukernel = (xnn_univector_ukernel_function) xnn_f16_vhswish_ukernel__neonfp16arith_x16,
Marat Dukhan751f6222022-01-09 23:10:04 -08002479 .init.f16_hswish = xnn_init_f16_hswish_neon_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08002480 .element_tile = 16,
2481 };
Marat Dukhan8d5d2592020-06-19 12:48:57 -07002482 }
Frank Barchard7e2cbb02020-06-12 01:22:13 -07002483 #endif // XNN_NO_F16_OPERATORS
2484
Frank Barchardb40ee632021-12-30 11:10:02 -08002485 /**************************** F32 AArch64 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002486 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002487 init_flags |= XNN_INIT_FLAG_F32;
2488
Marat Dukhandfe47b92020-12-14 02:48:43 -08002489 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Frank Barchard0d1052c2020-03-23 17:28:13 -07002490 #if XNN_ENABLE_ASSEMBLY
Frank Barchard143a1102021-06-15 09:15:34 -07002491 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2492 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2493 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2494 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002495 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002496 xnn_params.f32.gemm.mr = 6;
2497 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002498 #else // !XNN_ENABLE_ASSEMBLY
Marat Dukhanaefaef32020-04-09 07:09:34 -07002499 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2500 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2501 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2502 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002503 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002504 xnn_params.f32.gemm.mr = 6;
2505 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002506 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08002507 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
Frank Barchard0d1052c2020-03-23 17:28:13 -07002508 #if XNN_ENABLE_ASSEMBLY
2509 switch (cpuinfo_get_core(0)->uarch) {
2510 case cpuinfo_uarch_cortex_a57:
Frank Barchard143a1102021-06-15 09:15:34 -07002511 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
2512 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
2513 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2514 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002515 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002516 xnn_params.f32.gemm.mr = 6;
2517 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002518 break;
2519 case cpuinfo_uarch_cortex_a72:
Frank Barchard143a1102021-06-15 09:15:34 -07002520 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
2521 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
2522 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2523 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002524 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002525 xnn_params.f32.gemm.mr = 4;
2526 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002527 break;
2528 case cpuinfo_uarch_cortex_a75:
2529 case cpuinfo_uarch_cortex_a76:
2530 case cpuinfo_uarch_exynos_m3:
2531 case cpuinfo_uarch_exynos_m4:
Frank Barchard143a1102021-06-15 09:15:34 -07002532 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2533 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2534 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2535 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002536 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002537 xnn_params.f32.gemm.mr = 6;
2538 xnn_params.f32.gemm.nr = 8;
Zhi An Nga63651c2022-02-01 16:16:33 -08002539 #if XNN_ENABLE_JIT
2540 xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
Zhi An Nga3bf3ea2022-02-03 15:28:19 -08002541 xnn_params.f32.gemm.generator.igemm = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
Zhi An Ngeb7256b2022-02-03 16:02:54 -08002542 xnn_params.f32.gemm.generator.gemm1 = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Zhi An Ngf30a8592022-02-03 16:49:19 -08002543 xnn_params.f32.gemm.generator.igemm1 = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Zhi An Nga63651c2022-02-01 16:16:33 -08002544 #endif
Frank Barchard0d1052c2020-03-23 17:28:13 -07002545 break;
2546 case cpuinfo_uarch_exynos_m1:
2547 case cpuinfo_uarch_exynos_m2:
Marat Dukhanaefaef32020-04-09 07:09:34 -07002548 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma);
2549 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma);
2550 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma);
2551 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002552 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002553 xnn_params.f32.gemm.mr = 6;
2554 xnn_params.f32.gemm.nr = 8;
2555 xnn_params.f32.gemm.log2_sr = 2;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002556 break;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002557 case cpuinfo_uarch_cortex_a53:
2558 case cpuinfo_uarch_cortex_a55r0:
Marat Dukhanaefaef32020-04-09 07:09:34 -07002559 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
2560 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
2561 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
2562 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002563 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002564 xnn_params.f32.gemm.mr = 6;
2565 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002566 break;
Frank Barchardf975ee02021-11-05 16:01:00 -07002567 case cpuinfo_uarch_cortex_a35:
Frank Barchard0d1052c2020-03-23 17:28:13 -07002568 case cpuinfo_uarch_cortex_a55:
Marat Dukhanaefaef32020-04-09 07:09:34 -07002569 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
2570 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
2571 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
2572 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002573 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002574 xnn_params.f32.gemm.mr = 6;
2575 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002576 break;
2577 case cpuinfo_uarch_cortex_a73:
Marat Dukhanaefaef32020-04-09 07:09:34 -07002578 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
2579 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
Frank Barchard143a1102021-06-15 09:15:34 -07002580 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2581 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002582 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002583 xnn_params.f32.gemm.mr = 6;
2584 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002585 break;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002586 case cpuinfo_uarch_cortex_a77:
2587 case cpuinfo_uarch_exynos_m5:
2588 case cpuinfo_uarch_kryo:
Frank Barchard143a1102021-06-15 09:15:34 -07002589 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
2590 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
2591 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2592 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002593 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002594 xnn_params.f32.gemm.mr = 4;
2595 xnn_params.f32.gemm.nr = 8;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002596 break;
Frank Barchard990b2af2021-06-14 11:49:15 -07002597 case cpuinfo_uarch_cortex_a78:
2598 case cpuinfo_uarch_cortex_x1:
2599 default:
2600 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
Frank Barchard79cd5f92021-06-21 17:34:59 -07002601 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
Frank Barchard990b2af2021-06-14 11:49:15 -07002602 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2603 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2604 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2605 xnn_params.f32.gemm.mr = 6;
2606 xnn_params.f32.gemm.nr = 8;
2607 break;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002608 }
Marat Dukhan05702cf2020-03-26 15:41:33 -07002609 #if XNN_MAX_UARCH_TYPES > 1
2610 {
2611 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2612 const uint32_t mr = xnn_params.f32.gemm.mr;
2613 const uint32_t nr = xnn_params.f32.gemm.nr;
2614 const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
2615 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2616 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2617 if (uarch_info == NULL) {
2618 /* No more microarchitectures in the system */
2619 break;
2620 }
2621
2622 switch (uarch_info->uarch) {
2623 case cpuinfo_uarch_cortex_a53:
2624 case cpuinfo_uarch_cortex_a55r0:
2625 if (mr == 6 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002626 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
2627 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
2628 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2629 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002630 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002631 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
2632 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
2633 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2634 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002635 }
2636 break;
2637 case cpuinfo_uarch_cortex_a55:
2638 if (mr == 6 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002639 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
2640 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
2641 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2642 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002643 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07002644 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
2645 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
2646 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2647 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
Marat Dukhan05702cf2020-03-26 15:41:33 -07002648 }
2649 break;
2650 default:
2651 break;
2652 }
2653 }
2654 }
2655 #endif // XNN_MAX_UARCH_TYPES > 1
Frank Barchard0d1052c2020-03-23 17:28:13 -07002656 #else // !XNN_ENABLE_ASSEMBLY
Marat Dukhanaefaef32020-04-09 07:09:34 -07002657 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2658 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2659 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2660 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002661 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002662 xnn_params.f32.gemm.mr = 6;
2663 xnn_params.f32.gemm.nr = 8;
Marat Dukhan31677ad2020-10-13 23:59:31 -07002664 #endif // XNN_ENABLE_ASSEMBLY
Marat Dukhandfe47b92020-12-14 02:48:43 -08002665 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhanaefaef32020-04-09 07:09:34 -07002666 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64);
2667 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002668 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002669 xnn_params.f32.gemm2.mr = 4;
2670 xnn_params.f32.gemm2.nr = 2;
2671
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002672 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neonfma;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07002673 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanf5425ea2020-04-24 01:46:00 -07002674 xnn_params.f32.dwconv[0].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002675 xnn_params.f32.dwconv[0].primary_tile = 3;
2676
2677 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neonfma;
2678 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
2679 xnn_params.f32.dwconv[1].channel_tile = 8;
2680 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002681
Marat Dukhandfe47b92020-12-14 02:48:43 -08002682 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002683 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2684 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2685 xnn_params.f32.dwconv[2].channel_tile = 8;
2686 xnn_params.f32.dwconv[2].primary_tile = 9;
Marat Dukhandfe47b92020-12-14 02:48:43 -08002687 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
Frank Barchard0d1052c2020-03-23 17:28:13 -07002688 switch (cpuinfo_get_core(0)->uarch) {
2689 case cpuinfo_uarch_kryo:
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002690 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2691 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2692 xnn_params.f32.dwconv[2].channel_tile = 8;
2693 xnn_params.f32.dwconv[2].primary_tile = 9;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002694 break;
2695 #if XNN_ENABLE_ASSEMBLY
2696 case cpuinfo_uarch_cortex_a53:
2697 case cpuinfo_uarch_cortex_a55r0:
2698 case cpuinfo_uarch_cortex_a55:
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002699 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55;
2700 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2701 xnn_params.f32.dwconv[2].channel_tile = 4;
2702 xnn_params.f32.dwconv[2].primary_tile = 9;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002703 break;
2704 #endif // XNN_ENABLE_ASSEMBLY
2705 default:
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002706 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2707 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2708 xnn_params.f32.dwconv[2].channel_tile = 8;
2709 xnn_params.f32.dwconv[2].primary_tile = 9;
Frank Barchard0d1052c2020-03-23 17:28:13 -07002710 break;
2711 }
Marat Dukhandfe47b92020-12-14 02:48:43 -08002712 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
Marat Dukhanaefaef32020-04-09 07:09:34 -07002713
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07002714 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neonfma_acc2;
2715 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
2716 xnn_params.f32.dwconv[3].channel_tile = 8;
2717 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07002718
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002719 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08002720 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
2721 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
2722 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
2723 .primary_tile = 9,
2724 .incremental_tile = 8,
2725 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002726 };
2727 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08002728 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
2729 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
2730 .primary_tile = 9,
2731 .incremental_tile = 8,
2732 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002733 };
2734 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08002735 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
2736 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
2737 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
2738 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
2739 .row_tile = 7,
2740 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002741 };
2742 xnn_params.f32.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07002743 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
Marat Dukhan91ae1652021-08-15 19:19:49 -07002744 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002745 .mr = 9,
2746 .qr = 8,
2747 };
2748 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -07002749 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002750 .mr = 4,
2751 };
2752 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -07002753 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002754 .mr = 9,
2755 };
2756 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhanef25c6d2020-07-24 00:59:40 -07002757 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002758 .mr = 9,
2759 .qr = 8,
2760 };
Marat Dukhan660fd192020-03-10 04:55:30 -07002761 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
2762 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neonfma_c8,
Marat Dukhan69722492019-11-11 19:55:50 -08002763 .pixel_tile = 1,
2764 .channel_tile = 8,
2765 };
Marat Dukhane5efb162021-12-31 10:26:13 -08002766 xnn_params.f32.abs = (struct vunary_parameters) {
2767 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8,
2768 .element_tile = 8,
2769 };
Marat Dukhan94912792021-08-16 21:40:30 -07002770 xnn_params.f32.clamp = (struct vunary_parameters) {
2771 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
2772 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
2773 .element_tile = 8,
2774 };
Marat Dukhan4a79ff22022-01-01 12:16:48 -08002775 xnn_params.f32.elu = (struct vunary_parameters) {
2776 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16,
2777 .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_lut16_p3_params,
2778 .element_tile = 16,
2779 };
Marat Dukhan561d0682021-12-23 16:12:35 -08002780 xnn_params.f32.hswish = (struct vunary_parameters) {
2781 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08002782 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08002783 .element_tile = 16,
2784 };
Marat Dukhan2894e992021-12-30 08:29:48 -08002785 xnn_params.f32.lrelu = (struct vunary_parameters) {
2786 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8,
2787 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
2788 .element_tile = 8,
2789 };
Marat Dukhane5efb162021-12-31 10:26:13 -08002790 xnn_params.f32.neg = (struct vunary_parameters) {
2791 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
2792 .element_tile = 8,
2793 };
Marat Dukhan0e801372022-01-04 00:10:41 -08002794 xnn_params.f32.rndne = (struct vunary_parameters) {
2795 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
2796 .element_tile = 8,
2797 };
2798 xnn_params.f32.rndz = (struct vunary_parameters) {
2799 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
2800 .element_tile = 8,
2801 };
2802 xnn_params.f32.rndu = (struct vunary_parameters) {
2803 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
2804 .element_tile = 8,
2805 };
2806 xnn_params.f32.rndd = (struct vunary_parameters) {
2807 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
2808 .element_tile = 8,
2809 };
Marat Dukhance834ad2022-01-03 00:22:01 -08002810 xnn_params.f32.sigmoid = (struct vunary_parameters) {
2811 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16,
2812 .init.f32_sigmoid = xnn_init_f32_sigmoid_neonfma_rr1_lut64_p2_params,
2813 .element_tile = 16,
2814 };
Marat Dukhane5efb162021-12-31 10:26:13 -08002815 xnn_params.f32.sqr = (struct vunary_parameters) {
2816 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8,
2817 .element_tile = 8,
2818 };
Marat Dukhane72b2822021-12-30 14:46:58 -08002819 xnn_params.f32.sqrt = (struct vunary_parameters) {
2820 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__neon_sqrt_x4,
2821 .element_tile = 4,
2822 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002823 xnn_params.f32.prelu = (struct prelu_parameters) {
Marat Dukhan69c3f2c2019-11-06 12:30:01 -08002824 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
2825 .row_tile = 2,
2826 .channel_tile = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002827 };
Marat Dukhan4a5c7712022-01-05 22:43:13 -08002828 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
2829 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_x16,
2830 .init = xnn_init_f32_expminus_neonfma_rr1_lut64_p2_params,
2831 .element_tile = 16,
2832 };
Marat Dukhan1edc4542020-01-27 12:40:13 -08002833 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08002834 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002835 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
2836 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
2837 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08002838 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08002839 .element_tile = 8,
2840 };
Marat Dukhan69180502019-12-06 15:00:31 -08002841 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002842 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__neon_x8,
2843 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__neon_x8,
2844 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08002845 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan69180502019-12-06 15:00:31 -08002846 .element_tile = 8,
2847 };
Marat Dukhan79e7f842019-12-05 14:35:50 -08002848 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002849 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
2850 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
2851 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08002852 .element_tile = 8,
2853 };
2854 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002855 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
2856 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
2857 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08002858 .element_tile = 8,
2859 };
Marat Dukhan1e782c42019-11-21 17:02:40 -08002860 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002861 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
2862 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
2863 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08002864 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhanca2733c2019-11-15 23:21:17 -08002865 .element_tile = 8,
2866 };
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08002867 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002868 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
2869 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
2870 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08002871 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08002872 .element_tile = 8,
2873 };
Marat Dukhanf7399262020-06-05 10:58:44 -07002874 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07002875 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
2876 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
2877 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07002878 .element_tile = 8,
2879 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002880 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07002881 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07002882 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan49e6ee92019-11-06 15:55:29 -08002883 .channel_tile = 4,
2884 .row_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002885 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08002886 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08002887 init_flags |= XNN_INIT_FLAG_CHW_OPT;
2888
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002889 xnn_params.f32.spmm = (struct spmm_parameters) {
Marat Dukhan4baa2ac2021-02-11 10:04:09 -08002890 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined,
2891 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002892 .nr = 1,
XNNPACK Teamb455b122019-09-27 18:10:33 -07002893 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002894 xnn_params.f32.spmm2 = (struct spmm_parameters) {
Marat Dukhan4baa2ac2021-02-11 10:04:09 -08002895 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x2__neonfma,
2896 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002897 .nr = 2,
2898 };
2899 xnn_params.f32.spmm4 = (struct spmm_parameters) {
Marat Dukhan4baa2ac2021-02-11 10:04:09 -08002900 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x4__neonfma,
2901 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002902 .nr = 4,
2903 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07002904 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002905 .ukernel_with_symm_padding =
Marat Dukhan1f29b802020-05-15 23:46:39 -07002906 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002907 .output_channel_tile = 4,
2908 .output_height_tile = 2,
2909 .output_width_tile = 2,
2910 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002911 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
2912 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002913 .output_width_tile = 4,
2914 .output_height_tile = 3,
2915 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002916 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Marat Dukhan82f0c322020-10-25 19:17:35 -07002917 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4_acc2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002918 .output_width_tile = 4,
Marat Dukhan82f0c322020-10-25 19:17:35 -07002919 .output_height_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002920 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002921 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Marat Dukhan149f0ea2020-10-26 12:50:33 -07002922 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4,
Marat Dukhana99918a2019-11-15 14:40:12 -08002923 .output_width_tile = 4,
Marat Dukhan149f0ea2020-10-26 12:50:33 -07002924 .output_height_tile = 4,
Marat Dukhana99918a2019-11-15 14:40:12 -08002925 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07002926 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
2927 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2,
Marat Dukhana99918a2019-11-15 14:40:12 -08002928 .output_width_tile = 4,
2929 .output_height_tile = 1,
2930 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07002931 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
2932 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002933 .channel_tile = 4,
2934 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07002935 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
Artsiom Ablavatski2202c812021-01-22 14:16:43 -08002936 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neonfma_p8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07002937 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07002938 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07002939 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08002940 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002941 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002942
Frank Barchardb40ee632021-12-30 11:10:02 -08002943 /*************************** VCVT AArch64 micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07002944 #ifndef XNN_NO_VCVT_OPERATORS
2945 init_flags |= XNN_INIT_FLAG_VCVT;
2946
Marat Dukhan134f9842021-12-29 19:57:31 -08002947 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
2948 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
2949 .element_tile = 16,
2950 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08002951 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
2952 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
2953 .element_tile = 16,
2954 };
Marat Dukhaned2d7762021-12-03 23:51:19 -08002955 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
2956 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
2957 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
2958 .element_tile = 32,
2959 };
2960 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
2961 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
2962 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
2963 .element_tile = 32,
2964 };
Marat Dukhanf92206b2021-12-10 17:02:07 -08002965 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
2966 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__neon_x32,
2967 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
2968 .element_tile = 32,
2969 };
2970 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
2971 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__neon_x32,
2972 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
2973 .element_tile = 32,
2974 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07002975 #endif // XNN_NO_VCVT_OPERATORS
2976
Frank Barchardb40ee632021-12-30 11:10:02 -08002977 /**************************** X32 AArch64 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002978 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07002979 init_flags |= XNN_INIT_FLAG_X32;
2980
Marat Dukhan57dccd82020-04-14 00:53:10 -07002981 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002982 xnn_params.x32.zip = (struct zip_parameters) {
2983 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
2984 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
2985 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
2986 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
2987 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08002988 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08002989 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
2990 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08002991 .channel_tile = 1,
2992 .pixel_tile = 1,
2993 };
2994 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07002995 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07002996
Frank Barchardb40ee632021-12-30 11:10:02 -08002997 /**************************** XX AArch64 micro-kernels ****************************/
Marat Dukhan048931b2020-11-24 20:53:54 -08002998 #ifndef XNN_NO_XX_OPERATORS
2999 init_flags |= XNN_INIT_FLAG_XX;
3000
3001 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
Marat Dukhan933051b2021-08-07 16:26:15 -07003002 xnn_params.xx.fill = (struct fill_parameters) {
3003 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
3004 .row_tile = 1,
3005 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07003006 xnn_params.xx.pad = (struct pad_parameters) {
3007 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
3008 .row_tile = 1,
3009 };
Marat Dukhan048931b2020-11-24 20:53:54 -08003010 #endif
3011
Marat Dukhan933051b2021-08-07 16:26:15 -07003012#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
3013 if (!cpuinfo_has_x86_sse2()) {
3014 xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
3015 return;
3016 }
3017
Frank Barchardb40ee632021-12-30 11:10:02 -08003018 /**************************** QC8 x86 micro-kernels ****************************/
Marat Dukhan5e353862021-06-15 09:03:25 -07003019 #ifndef XNN_NO_QC8_OPERATORS
3020 init_flags |= XNN_INIT_FLAG_QC8;
3021
Marat Dukhan039a3882022-01-21 14:53:11 -08003022 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan5e353862021-06-15 09:03:25 -07003023 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3024 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3025 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3026 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3027 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_avx512_params;
3028 xnn_params.qc8.gemm.mr = 4;
3029 xnn_params.qc8.gemm.nr = 16;
3030 xnn_params.qc8.gemm.log2_kr = 3;
3031 } else if (cpuinfo_has_x86_xop()) {
3032 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3033 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3034 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3035 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3036 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3037 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
3038 xnn_params.qc8.gemm.mr = 2;
3039 xnn_params.qc8.gemm.nr = 4;
3040 xnn_params.qc8.gemm.log2_kr = 3;
3041 } else if (cpuinfo_has_x86_avx2()) {
3042 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3043 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3044 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3045 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3046 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_avx2_params;
3047 xnn_params.qc8.gemm.mr = 3;
3048 xnn_params.qc8.gemm.nr = 8;
3049 xnn_params.qc8.gemm.log2_kr = 3;
3050 } else if (cpuinfo_has_x86_avx()) {
3051 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3052 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3053 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3054 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3055 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
3056 xnn_params.qc8.gemm.mr = 2;
3057 xnn_params.qc8.gemm.nr = 4;
3058 xnn_params.qc8.gemm.log2_kr = 3;
3059 } else if (cpuinfo_has_x86_sse4_1()) {
3060 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3061 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3062 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3063 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3064 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
3065 xnn_params.qc8.gemm.mr = 3;
3066 xnn_params.qc8.gemm.nr = 4;
3067 xnn_params.qc8.gemm.log2_kr = 3;
3068 } else {
3069 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3070 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3071 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3072 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3073 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse2_params;
3074 xnn_params.qc8.gemm.mr = 3;
3075 xnn_params.qc8.gemm.nr = 4;
3076 xnn_params.qc8.gemm.log2_kr = 3;
3077 }
3078
Marat Dukhan039a3882022-01-21 14:53:11 -08003079 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan5e353862021-06-15 09:03:25 -07003080 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
3081 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_avx512_params;
3082 xnn_params.qc8.dwconv[0].channel_tile = 32;
3083 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
3084 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_avx512_params;
3085 xnn_params.qc8.dwconv[1].channel_tile = 32;
3086 } else if (cpuinfo_has_x86_xop()) {
3087 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
Marat Dukhan28480592021-07-27 23:52:27 -07003088 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07003089 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3090 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan28480592021-07-27 23:52:27 -07003091 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07003092 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3093 xnn_params.qc8.dwconv[1].channel_tile = 16;
3094 } else if (cpuinfo_has_x86_avx2()) {
3095 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
3096 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_avx2_params;
3097 xnn_params.qc8.dwconv[0].channel_tile = 16;
3098 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
3099 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_avx2_params;
3100 xnn_params.qc8.dwconv[1].channel_tile = 16;
3101 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan28480592021-07-27 23:52:27 -07003102 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07003103 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3104 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan28480592021-07-27 23:52:27 -07003105 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
Marat Dukhan5e353862021-06-15 09:03:25 -07003106 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3107 xnn_params.qc8.dwconv[1].channel_tile = 16;
3108 } else if (cpuinfo_has_x86_sse4_1()) {
3109 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
3110 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3111 xnn_params.qc8.dwconv[0].channel_tile = 8;
3112 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
3113 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3114 xnn_params.qc8.dwconv[1].channel_tile = 8;
3115 } else if (cpuinfo_has_x86_sse2()) {
3116 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
3117 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse2_params;
3118 xnn_params.qc8.dwconv[0].channel_tile = 8;
3119 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
3120 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse2_params;
3121 xnn_params.qc8.dwconv[1].channel_tile = 8;
3122 }
3123 xnn_params.qc8.dwconv[0].primary_tile = 9;
3124 xnn_params.qc8.dwconv[1].primary_tile = 25;
3125 #endif // XNN_NO_QC8_OPERATORS
3126
Frank Barchardb40ee632021-12-30 11:10:02 -08003127 /**************************** QS8 x86 micro-kernels ****************************/
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003128 #ifndef XNN_NO_QS8_OPERATORS
3129 init_flags |= XNN_INIT_FLAG_QS8;
3130
Marat Dukhan039a3882022-01-21 14:53:11 -08003131 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan71855ee2021-05-25 19:05:06 -07003132 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3133 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3134 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3135 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3136 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
Marat Dukhanbb00b1d2020-08-10 11:37:23 -07003137 xnn_params.qs8.gemm.mr = 4;
3138 xnn_params.qs8.gemm.nr = 16;
3139 xnn_params.qs8.gemm.log2_kr = 3;
3140 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhan75215d82020-08-07 23:08:03 -07003141 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
Marat Dukhanc46e6712021-06-01 19:00:16 -07003142 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3143 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3144 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3145 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3146 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan75215d82020-08-07 23:08:03 -07003147 xnn_params.qs8.gemm.mr = 2;
3148 xnn_params.qs8.gemm.nr = 4;
3149 xnn_params.qs8.gemm.log2_kr = 3;
3150 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan9b474cf2021-05-25 16:37:48 -07003151 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3152 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3153 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3154 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3155 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003156 xnn_params.qs8.gemm.mr = 3;
3157 xnn_params.qs8.gemm.nr = 8;
3158 xnn_params.qs8.gemm.log2_kr = 3;
Marat Dukhana3c16332021-04-02 15:03:27 -07003159 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhanc46e6712021-06-01 19:00:16 -07003160 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3161 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3162 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3163 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3164 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhana3c16332021-04-02 15:03:27 -07003165 xnn_params.qs8.gemm.mr = 2;
3166 xnn_params.qs8.gemm.nr = 4;
3167 xnn_params.qs8.gemm.log2_kr = 3;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003168 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhanc46e6712021-06-01 19:00:16 -07003169 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3170 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3171 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3172 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3173 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003174 xnn_params.qs8.gemm.mr = 3;
3175 xnn_params.qs8.gemm.nr = 4;
3176 xnn_params.qs8.gemm.log2_kr = 3;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003177 } else {
Marat Dukhanc46e6712021-06-01 19:00:16 -07003178 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3179 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3180 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3181 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3182 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003183 xnn_params.qs8.gemm.mr = 3;
3184 xnn_params.qs8.gemm.nr = 4;
3185 xnn_params.qs8.gemm.log2_kr = 3;
3186 }
3187
Marat Dukhan039a3882022-01-21 14:53:11 -08003188 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan71855ee2021-05-25 19:05:06 -07003189 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
3190 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
Marat Dukhan2ffc5e62020-09-06 22:33:38 -07003191 xnn_params.qs8.dwconv[0].channel_tile = 32;
Marat Dukhan71855ee2021-05-25 19:05:06 -07003192 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
3193 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003194 xnn_params.qs8.dwconv[1].channel_tile = 32;
Marat Dukhan3fd4e272021-04-10 11:16:42 -07003195 } else if (cpuinfo_has_x86_xop()) {
3196 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
Marat Dukhan02f06e32021-07-27 14:33:47 -07003197 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003198 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan3fd4e272021-04-10 11:16:42 -07003199 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan09668562021-07-26 16:52:20 -07003200 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003201 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003202 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhan2ffc5e62020-09-06 22:33:38 -07003203 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan9b474cf2021-05-25 16:37:48 -07003204 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
3205 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
Marat Dukhand65a1522020-08-04 19:28:18 -07003206 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan9b474cf2021-05-25 16:37:48 -07003207 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
3208 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003209 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhanfa0ab852021-04-02 17:30:49 -07003210 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan09668562021-07-26 16:52:20 -07003211 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003212 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhanfa0ab852021-04-02 17:30:49 -07003213 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan09668562021-07-26 16:52:20 -07003214 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003215 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003216 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhand65a1522020-08-04 19:28:18 -07003217 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan09668562021-07-26 16:52:20 -07003218 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003219 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhand65a1522020-08-04 19:28:18 -07003220 xnn_params.qs8.dwconv[0].channel_tile = 8;
Marat Dukhan09668562021-07-26 16:52:20 -07003221 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003222 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003223 xnn_params.qs8.dwconv[1].channel_tile = 8;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003224 } else if (cpuinfo_has_x86_sse2()) {
Marat Dukhan09668562021-07-26 16:52:20 -07003225 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003226 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
Marat Dukhand65a1522020-08-04 19:28:18 -07003227 xnn_params.qs8.dwconv[0].channel_tile = 8;
Marat Dukhan09668562021-07-26 16:52:20 -07003228 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16_add16;
Marat Dukhancaf48312021-06-01 20:20:58 -07003229 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003230 xnn_params.qs8.dwconv[1].channel_tile = 8;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003231 }
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003232 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan4ed14882021-05-12 17:50:40 -07003233 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhan9e0b5392020-08-07 02:29:34 -07003234
3235 if (cpuinfo_has_x86_sse4_1()) {
3236 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan9e258d62022-01-12 10:50:51 -08003237 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
3238 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
Marat Dukhan53f41062022-01-11 19:44:57 -08003239 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse4_params,
3240 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse4_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08003241 .row_tile = 7,
3242 .channel_tile = 8,
Marat Dukhan9e0b5392020-08-07 02:29:34 -07003243 };
Marat Dukhan53f41062022-01-11 19:44:57 -08003244 } else {
Marat Dukhan9e0b5392020-08-07 02:29:34 -07003245 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan9e258d62022-01-12 10:50:51 -08003246 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
3247 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
Marat Dukhan53f41062022-01-11 19:44:57 -08003248 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse2_params,
3249 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse2_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08003250 .row_tile = 7,
3251 .channel_tile = 8,
Marat Dukhan9e0b5392020-08-07 02:29:34 -07003252 };
3253 }
Marat Dukhanff209482020-09-03 14:26:53 -07003254
Marat Dukhan039a3882022-01-21 14:53:11 -08003255 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhane76049a2021-07-22 14:48:59 -07003256 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3257 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
3258 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
3259 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07003260 .init.qs8_addsub = xnn_init_qs8_add_minmax_avx512_params,
Marat Dukhane76049a2021-07-22 14:48:59 -07003261 .element_tile = 16,
3262 };
3263 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhanbb9225e2020-09-06 22:40:56 -07003264 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3265 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
3266 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
3267 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003268 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhanbb9225e2020-09-06 22:40:56 -07003269 .element_tile = 8,
3270 };
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003271 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan7679b1e2021-07-20 18:32:23 -07003272 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3273 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
3274 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
3275 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07003276 .init.qs8_addsub = xnn_init_qs8_add_minmax_avx2_params,
Marat Dukhan7679b1e2021-07-20 18:32:23 -07003277 .element_tile = 16,
3278 };
Marat Dukhane9c4b962021-04-02 16:56:55 -07003279 } else if (cpuinfo_has_x86_avx()) {
3280 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3281 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
3282 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
3283 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003284 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul32_params,
Marat Dukhane9c4b962021-04-02 16:56:55 -07003285 .element_tile = 8,
3286 };
Marat Dukhanbb9225e2020-09-06 22:40:56 -07003287 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhanff209482020-09-03 14:26:53 -07003288 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3289 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
3290 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
3291 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003292 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul16_params,
Marat Dukhanff209482020-09-03 14:26:53 -07003293 .element_tile = 8,
3294 };
3295 } else {
3296 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3297 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
3298 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
3299 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003300 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse2_params,
Marat Dukhanff209482020-09-03 14:26:53 -07003301 .element_tile = 8,
3302 };
3303 }
Marat Dukhan0853b8a2021-08-03 01:01:53 -07003304 if (cpuinfo_has_x86_avx()) {
3305 xnn_params.qs8.vmul = (struct vbinary_parameters) {
3306 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3307 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3308 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3309 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
3310 .element_tile = 16,
3311 };
3312 } else if (cpuinfo_has_x86_sse4_1()) {
3313 xnn_params.qs8.vmul = (struct vbinary_parameters) {
3314 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3315 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3316 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3317 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
3318 .element_tile = 16,
3319 };
3320 } else {
3321 xnn_params.qs8.vmul = (struct vbinary_parameters) {
3322 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3323 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3324 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3325 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse2_params,
3326 .element_tile = 8,
3327 };
3328 }
Marat Dukhan07e50402020-08-05 17:16:53 -07003329 #endif // XNN_NO_QS8_OPERATORS
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07003330
Frank Barchardb40ee632021-12-30 11:10:02 -08003331 /**************************** QU8 x86 micro-kernels ****************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -07003332 #ifndef XNN_NO_QU8_OPERATORS
3333 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003334
Marat Dukhan039a3882022-01-21 14:53:11 -08003335 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan3cf2e222021-07-08 11:38:45 -07003336 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3337 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3338 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3339 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3340 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3341 xnn_params.qu8.gemm.mr = 4;
3342 xnn_params.qu8.gemm.nr = 16;
3343 xnn_params.qu8.gemm.log2_kr = 3;
3344 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhan902ef7f2021-07-02 16:11:06 -07003345 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3346 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3347 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3348 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3349 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3350 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3351 xnn_params.qu8.gemm.mr = 2;
3352 xnn_params.qu8.gemm.nr = 4;
3353 xnn_params.qu8.gemm.log2_kr = 3;
3354 } else if (cpuinfo_has_x86_avx2()) {
3355 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3356 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3357 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3358 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3359 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3360 xnn_params.qu8.gemm.mr = 3;
3361 xnn_params.qu8.gemm.nr = 8;
3362 xnn_params.qu8.gemm.log2_kr = 3;
3363 } else if (cpuinfo_has_x86_avx()) {
3364 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3365 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3366 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3367 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3368 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3369 xnn_params.qu8.gemm.mr = 2;
3370 xnn_params.qu8.gemm.nr = 4;
3371 xnn_params.qu8.gemm.log2_kr = 3;
3372 } else if (cpuinfo_has_x86_sse4_1()) {
3373 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3374 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3375 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3376 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3377 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3378 xnn_params.qu8.gemm.mr = 3;
3379 xnn_params.qu8.gemm.nr = 4;
3380 xnn_params.qu8.gemm.log2_kr = 3;
3381 } else {
3382 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3383 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3384 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3385 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3386 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3387 xnn_params.qu8.gemm.mr = 3;
3388 xnn_params.qu8.gemm.nr = 4;
3389 xnn_params.qu8.gemm.log2_kr = 3;
3390 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07003391
Marat Dukhan039a3882022-01-21 14:53:11 -08003392 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhanabee3a72021-07-09 09:04:52 -07003393 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
3394 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3395 xnn_params.qu8.dwconv[0].channel_tile = 32;
3396 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
3397 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3398 xnn_params.qu8.dwconv[1].channel_tile = 32;
3399 } else if (cpuinfo_has_x86_xop()) {
3400 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3401 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32;
3402 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3403 xnn_params.qu8.dwconv[0].channel_tile = 16;
3404 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32;
3405 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3406 xnn_params.qu8.dwconv[1].channel_tile = 16;
3407 } else if (cpuinfo_has_x86_avx2()) {
3408 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
3409 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3410 xnn_params.qu8.dwconv[0].channel_tile = 16;
3411 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
3412 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3413 xnn_params.qu8.dwconv[1].channel_tile = 16;
3414 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhancaa7fc72021-07-27 07:48:24 -07003415 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16;
Marat Dukhanabee3a72021-07-09 09:04:52 -07003416 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3417 xnn_params.qu8.dwconv[0].channel_tile = 16;
Marat Dukhancaa7fc72021-07-27 07:48:24 -07003418 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16;
Marat Dukhanabee3a72021-07-09 09:04:52 -07003419 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3420 xnn_params.qu8.dwconv[1].channel_tile = 16;
3421 } else if (cpuinfo_has_x86_sse4_1()) {
3422 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
3423 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3424 xnn_params.qu8.dwconv[0].channel_tile = 8;
3425 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
3426 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3427 xnn_params.qu8.dwconv[1].channel_tile = 8;
3428 } else if (cpuinfo_has_x86_sse2()) {
3429 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
3430 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3431 xnn_params.qu8.dwconv[0].channel_tile = 8;
3432 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
3433 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3434 xnn_params.qu8.dwconv[1].channel_tile = 8;
3435 }
Marat Dukhan08b7a972020-07-14 18:17:29 -07003436 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhanabee3a72021-07-09 09:04:52 -07003437 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003438
Marat Dukhan08b7a972020-07-14 18:17:29 -07003439 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08003440 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__sse2_c8,
3441 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__sse2_c8,
Marat Dukhan3c949a32022-01-09 20:12:33 -08003442 .init.qu8 = xnn_init_qu8_avgpool_minmax_sse2_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08003443 .primary_tile = 9,
3444 .incremental_tile = 8,
3445 .channel_tile = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003446 };
Marat Dukhand1f53e42022-01-12 22:34:51 -08003447 if (cpuinfo_has_x86_sse4_1()) {
3448 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
3449 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
3450 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
3451 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse4_params,
3452 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse4_params,
3453 .row_tile = 7,
3454 .channel_tile = 8,
3455 };
3456 } else {
3457 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
3458 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
3459 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
3460 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse2_params,
3461 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse2_params,
3462 .row_tile = 7,
3463 .channel_tile = 8,
3464 };
3465 }
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003466
Marat Dukhan039a3882022-01-21 14:53:11 -08003467 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhane76049a2021-07-22 14:48:59 -07003468 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3469 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
3470 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
3471 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07003472 .init.qu8_addsub = xnn_init_qu8_add_minmax_avx512_params,
Marat Dukhane76049a2021-07-22 14:48:59 -07003473 .element_tile = 16,
3474 };
3475 } else if (cpuinfo_has_x86_xop()) {
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003476 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3477 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
3478 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
3479 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003480 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse4_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003481 .element_tile = 8,
3482 };
3483 } else if (cpuinfo_has_x86_avx2()) {
3484 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3485 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
3486 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
3487 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
Marat Dukhan64287252021-09-07 16:20:03 -07003488 .init.qu8_addsub = xnn_init_qu8_add_minmax_avx2_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003489 .element_tile = 16,
3490 };
3491 } else if (cpuinfo_has_x86_avx()) {
3492 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3493 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
3494 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
3495 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003496 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse4_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003497 .element_tile = 8,
3498 };
3499 } else if (cpuinfo_has_x86_sse4_1()) {
3500 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3501 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
3502 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
3503 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003504 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse2_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003505 .element_tile = 8,
3506 };
3507 } else {
3508 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3509 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
3510 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
3511 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
Marat Dukhan64287252021-09-07 16:20:03 -07003512 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse2_params,
Marat Dukhan3eac69c2021-07-21 01:42:29 -07003513 .element_tile = 8,
3514 };
3515 }
Marat Dukhan0853b8a2021-08-03 01:01:53 -07003516 if (cpuinfo_has_x86_avx()) {
3517 xnn_params.qu8.vmul = (struct vbinary_parameters) {
3518 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3519 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3520 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3521 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3522 .element_tile = 16,
3523 };
3524 } else if (cpuinfo_has_x86_sse4_1()) {
3525 xnn_params.qu8.vmul = (struct vbinary_parameters) {
3526 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3527 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3528 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3529 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3530 .element_tile = 16,
3531 };
3532 } else {
3533 xnn_params.qu8.vmul = (struct vbinary_parameters) {
3534 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3535 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3536 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3537 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3538 .element_tile = 8,
3539 };
3540 }
Marat Dukhan08b7a972020-07-14 18:17:29 -07003541 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07003542
Frank Barchardb40ee632021-12-30 11:10:02 -08003543 /**************************** S8 x86 micro-kernels ****************************/
Marat Dukhan23147532021-08-16 07:26:56 -07003544 #ifndef XNN_NO_S8_OPERATORS
3545 init_flags |= XNN_INIT_FLAG_S8;
3546
3547 if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07003548 xnn_params.s8.clamp = (struct vunary_parameters) {
3549 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__sse41_x64,
3550 .init.s8_minmax = xnn_init_s8_minmax_sse4_params,
3551 .element_tile = 64,
3552 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08003553 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
3554 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse41_c16,
3555 .pixel_tile = 1,
3556 .channel_tile = 16,
3557 };
Marat Dukhan23147532021-08-16 07:26:56 -07003558 xnn_params.s8.maxpool = (struct maxpool_parameters) {
3559 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse41_c16,
3560 .init.s8 = xnn_init_s8_minmax_sse4_params,
3561 .mr = 9,
3562 .qr = 8,
3563 };
3564 } else {
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07003565 xnn_params.s8.clamp = (struct vunary_parameters) {
3566 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__sse2_x64,
3567 .init.s8_minmax = xnn_init_s8_minmax_sse2_params,
3568 .element_tile = 64,
3569 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08003570 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
3571 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse2_c8,
3572 .pixel_tile = 1,
3573 .channel_tile = 8,
3574 };
Marat Dukhan23147532021-08-16 07:26:56 -07003575 xnn_params.s8.maxpool = (struct maxpool_parameters) {
3576 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse2_c16,
3577 .init.s8 = xnn_init_s8_minmax_sse2_params,
3578 .mr = 9,
3579 .qr = 8,
3580 };
3581 }
Marat Dukhan94912792021-08-16 21:40:30 -07003582 #endif // XNN_NO_S8_OPERATORS
Marat Dukhan23147532021-08-16 07:26:56 -07003583
Frank Barchardb40ee632021-12-30 11:10:02 -08003584 /**************************** U8 x86 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003585 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003586 init_flags |= XNN_INIT_FLAG_U8;
3587
Marat Dukhan94912792021-08-16 21:40:30 -07003588 xnn_params.u8.clamp = (struct vunary_parameters) {
3589 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__sse2_x64,
3590 .init.u8_minmax = xnn_init_u8_minmax_sse2_params,
3591 .element_tile = 64,
3592 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08003593 if (cpuinfo_has_x86_sse4_1()) {
3594 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
3595 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse41_c16,
3596 .pixel_tile = 1,
3597 .channel_tile = 16,
3598 };
3599 } else {
3600 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
3601 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse2_c8,
3602 .pixel_tile = 1,
3603 .channel_tile = 8,
3604 };
3605 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003606 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003607 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16,
Marat Dukhan91ae1652021-08-15 19:19:49 -07003608 .init.u8 = xnn_init_u8_minmax_sse2_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003609 .mr = 9,
3610 .qr = 8,
3611 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003612 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
3613 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__sse2;
3614 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07003615
Frank Barchardb40ee632021-12-30 11:10:02 -08003616 /**************************** X8 x86 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003617 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003618 init_flags |= XNN_INIT_FLAG_X8;
3619
Marat Dukhan039a3882022-01-21 14:53:11 -08003620 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan98e054b2021-09-13 09:43:50 -07003621 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx512skx_vpshufb_x64;
3622 } else if (cpuinfo_has_x86_avx2()) {
3623 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx2_x128;
3624 } else if (cpuinfo_has_x86_avx()) {
3625 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx_x64;
3626 } else {
3627 // Note: SSSE3 version is usually slower than scalar
3628 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
3629 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003630 xnn_params.x8.zip = (struct zip_parameters) {
3631 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__sse2,
3632 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__sse2,
3633 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__sse2,
3634 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__sse2,
3635 };
3636 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07003637
Marat Dukhan8f920a62022-01-19 14:56:23 -08003638 /**************************** F16 x86 micro-kernels ****************************/
3639 #ifndef XNN_NO_F16_OPERATORS
3640 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
3641 init_flags |= XNN_INIT_FLAG_F16;
3642
3643 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast);
3644 xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast);
3645 xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast);
3646 xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast);
3647 xnn_params.f16.gemm.init.f16 = xnn_init_f16_scaleminmax_avx_params;
3648 xnn_params.f16.gemm.mr = 4;
3649 xnn_params.f16.gemm.nr = 16;
3650
3651 xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__fma3;
3652 xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_avx_params;
3653 xnn_params.f16.dwconv[0].channel_tile = 16;
3654 xnn_params.f16.dwconv[0].primary_tile = 4;
3655
3656 xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__fma3;
3657 xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_avx_params;
3658 xnn_params.f16.dwconv[1].channel_tile = 16;
3659 xnn_params.f16.dwconv[1].primary_tile = 9;
3660
3661 xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2;
3662 xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_avx_params;
3663 xnn_params.f16.dwconv[2].channel_tile = 8;
3664 xnn_params.f16.dwconv[2].primary_tile = 25;
3665
3666 xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
3667 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8,
3668 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8,
3669 .init.f16 = xnn_init_f16_scaleminmax_avx_params,
3670 .update.f16 = xnn_update_f16_scaleminmax_avx_params,
3671 .row_tile = 7,
3672 .channel_tile = 8,
3673 };
Marat Dukhan0a756b52022-02-03 23:08:50 -08003674
Marat Dukhan5756a922022-02-04 01:55:53 -08003675 xnn_params.f16.maxpool = (struct maxpool_parameters) {
3676 .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8,
3677 .init.f16 = xnn_init_f16_minmax_avx_params,
3678 .mr = 9,
3679 .qr = 8,
3680 };
3681
Marat Dukhan0a756b52022-02-03 23:08:50 -08003682 xnn_params.f16.prelu = (struct prelu_parameters) {
3683 .ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__f16c_2x16,
3684 .row_tile = 2,
3685 .channel_tile = 16,
3686 };
3687
Marat Dukhan8f920a62022-01-19 14:56:23 -08003688 xnn_params.f16.vadd = (struct vbinary_parameters) {
3689 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__f16c_x16,
3690 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
3691 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
3692 .init.f16_minmax = xnn_init_f16_minmax_avx_params,
3693 .element_tile = 16,
3694 };
3695 xnn_params.f16.vmul = (struct vbinary_parameters) {
3696 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__f16c_x16,
3697 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
3698 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
3699 .init.f16_minmax = xnn_init_f16_minmax_avx_params,
3700 .element_tile = 16,
3701 };
3702 xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
3703 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__fma3_2x,
3704 .init.f16 = xnn_init_f16_minmax_avx_params,
3705 .channel_tile = 8,
3706 .row_tile = 2,
3707 };
3708 xnn_params.f16.hswish = (struct vunary_parameters) {
3709 .ukernel = (xnn_univector_ukernel_function) xnn_f16_vhswish_ukernel__f16c_x16,
3710 .init.f16_hswish = xnn_init_f16_hswish_avx_params,
3711 .element_tile = 16,
3712 };
3713 }
3714 #endif // XNN_NO_F16_OPERATORS
3715
Frank Barchardb40ee632021-12-30 11:10:02 -08003716 /**************************** F32 x86 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003717 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07003718 init_flags |= XNN_INIT_FLAG_F32;
3719
Marat Dukhan0f349c42019-11-27 11:58:54 -08003720 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07003721 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast);
3722 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast);
3723 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast);
3724 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003725 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003726 xnn_params.f32.gemm.mr = 7;
3727 xnn_params.f32.gemm.nr = 16;
Marat Dukhan48976702022-01-10 18:18:04 -08003728 } else if (cpuinfo_has_x86_fma3()) {
Marat Dukhan27121322019-12-09 14:57:40 -08003729 switch (cpuinfo_get_core(0)->uarch) {
3730 case cpuinfo_uarch_zen:
Marat Dukhanb3801eb2020-03-12 13:41:11 -07003731 case cpuinfo_uarch_dhyana:
Marat Dukhanaefaef32020-04-09 07:09:34 -07003732 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast);
3733 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast);
3734 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast);
3735 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003736 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003737 xnn_params.f32.gemm.mr = 4;
3738 xnn_params.f32.gemm.nr = 16;
3739 xnn_params.f32.gemm.log2_sr = 2;
Marat Dukhan27121322019-12-09 14:57:40 -08003740 break;
3741 default:
Marat Dukhanaefaef32020-04-09 07:09:34 -07003742 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast);
3743 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast);
3744 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast);
3745 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003746 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003747 xnn_params.f32.gemm.mr = 5;
3748 xnn_params.f32.gemm.nr = 16;
Marat Dukhan27121322019-12-09 14:57:40 -08003749 break;
3750 }
Marat Dukhan48976702022-01-10 18:18:04 -08003751 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07003752 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast);
3753 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast);
3754 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast);
3755 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003756 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003757 xnn_params.f32.gemm.mr = 5;
3758 xnn_params.f32.gemm.nr = 16;
Marat Dukhan1025ea32019-11-21 16:01:08 -08003759 } else {
Marat Dukhanaefaef32020-04-09 07:09:34 -07003760 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__sse_load1);
3761 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__sse_load1);
3762 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__sse_load1);
3763 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__sse_load1);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003764 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003765 xnn_params.f32.gemm.mr = 4;
3766 xnn_params.f32.gemm.nr = 8;
Marat Dukhan1025ea32019-11-21 16:01:08 -08003767 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07003768 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__sse);
3769 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__sse);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003770 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003771 xnn_params.f32.gemm2.mr = 4;
3772 xnn_params.f32.gemm2.nr = 2;
3773 xnn_params.f32.gemm2.log2_kr = 2;
3774
Marat Dukhan479f87e2019-11-27 15:17:06 -08003775 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003776 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx512f;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003777 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003778 xnn_params.f32.dwconv[0].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003779 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003780
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003781 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx512f;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003782 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003783 xnn_params.f32.dwconv[1].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003784 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003785
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003786 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx512f;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003787 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003788 xnn_params.f32.dwconv[2].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003789 xnn_params.f32.dwconv[2].primary_tile = 9;
3790
3791 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x25__avx512f;
3792 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
3793 xnn_params.f32.dwconv[3].channel_tile = 16;
3794 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan48976702022-01-10 18:18:04 -08003795 } else if (cpuinfo_has_x86_fma3()) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003796 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__fma3;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003797 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003798 xnn_params.f32.dwconv[0].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003799 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003800
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003801 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__fma3;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003802 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003803 xnn_params.f32.dwconv[1].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003804 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003805
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003806 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__fma3;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003807 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003808 xnn_params.f32.dwconv[2].channel_tile = 16;
3809 xnn_params.f32.dwconv[2].primary_tile = 9;
3810
3811 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__fma3;
3812 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
3813 xnn_params.f32.dwconv[3].channel_tile = 8;
3814 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan48976702022-01-10 18:18:04 -08003815 } else if (cpuinfo_has_x86_avx()) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003816 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003817 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003818 xnn_params.f32.dwconv[0].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003819 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003820
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003821 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003822 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003823 xnn_params.f32.dwconv[1].channel_tile = 16;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003824 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003825
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003826 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003827 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003828 xnn_params.f32.dwconv[2].channel_tile = 16;
3829 xnn_params.f32.dwconv[2].primary_tile = 9;
3830
3831 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__avx;
3832 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
3833 xnn_params.f32.dwconv[3].channel_tile = 8;
3834 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan17ec5f32019-11-22 13:34:16 -08003835 } else {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003836 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__sse;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003837 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003838 xnn_params.f32.dwconv[0].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003839 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003840
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003841 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__sse;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003842 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003843 xnn_params.f32.dwconv[1].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003844 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003845
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003846 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__sse;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07003847 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_sse_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07003848 xnn_params.f32.dwconv[2].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07003849 xnn_params.f32.dwconv[2].primary_tile = 9;
3850
3851 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__sse;
3852 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_sse_params;
3853 xnn_params.f32.dwconv[3].channel_tile = 8;
3854 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhan17ec5f32019-11-22 13:34:16 -08003855 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003856 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08003857 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__sse_c4,
3858 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08003859 .init.f32 = xnn_init_f32_scaleminmax_sse_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08003860 .primary_tile = 9,
3861 .incremental_tile = 8,
3862 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003863 };
3864 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08003865 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4,
3866 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4,
3867 .primary_tile = 9,
3868 .incremental_tile = 8,
3869 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003870 };
3871 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08003872 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4,
3873 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08003874 .init.f32 = xnn_init_f32_scaleminmax_sse_params,
3875 .update.f32 = xnn_update_f32_scaleminmax_sse_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08003876 .row_tile = 7,
3877 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003878 };
3879 xnn_params.f32.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003880 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4,
Marat Dukhan91ae1652021-08-15 19:19:49 -07003881 .init.f32 = xnn_init_f32_minmax_sse_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003882 .mr = 9,
3883 .qr = 8,
3884 };
3885 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003886 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__sse2_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003887 .mr = 4,
3888 };
3889 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003890 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__sse2_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003891 .mr = 9,
3892 };
3893 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07003894 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07003895 .mr = 9,
3896 .qr = 8,
3897 };
Marat Dukhan660fd192020-03-10 04:55:30 -07003898 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
3899 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__sse_c8,
Marat Dukhan69722492019-11-11 19:55:50 -08003900 .pixel_tile = 1,
3901 .channel_tile = 8,
3902 };
Marat Dukhane2c3f292019-11-27 15:40:54 -08003903 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08003904 xnn_params.f32.abs = (struct vunary_parameters) {
3905 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx512f_x16,
3906 .init.f32_abs = xnn_init_f32_abs_avx512_params,
3907 .element_tile = 16,
3908 };
Marat Dukhan48976702022-01-10 18:18:04 -08003909 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08003910 xnn_params.f32.abs = (struct vunary_parameters) {
3911 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx_x16,
3912 .init.f32_abs = xnn_init_f32_abs_avx_params,
3913 .element_tile = 16,
3914 };
Marat Dukhan5020b962020-06-08 13:30:10 -07003915 } else {
Marat Dukhane5efb162021-12-31 10:26:13 -08003916 xnn_params.f32.abs = (struct vunary_parameters) {
3917 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__sse_x8,
3918 .init.f32_abs = xnn_init_f32_abs_sse_params,
3919 .element_tile = 8,
3920 };
Marat Dukhan5020b962020-06-08 13:30:10 -07003921 }
3922 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan94912792021-08-16 21:40:30 -07003923 xnn_params.f32.clamp = (struct vunary_parameters) {
3924 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__avx512f_x16,
3925 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
3926 .element_tile = 16,
3927 };
Marat Dukhan48976702022-01-10 18:18:04 -08003928 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan94912792021-08-16 21:40:30 -07003929 xnn_params.f32.clamp = (struct vunary_parameters) {
3930 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__avx_x16,
3931 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
3932 .element_tile = 16,
3933 };
Marat Dukhane2c3f292019-11-27 15:40:54 -08003934 } else {
Marat Dukhan94912792021-08-16 21:40:30 -07003935 xnn_params.f32.clamp = (struct vunary_parameters) {
3936 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__sse_x8,
3937 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
3938 .element_tile = 8,
3939 };
Marat Dukhane2c3f292019-11-27 15:40:54 -08003940 }
Marat Dukhan662faa02019-12-09 22:48:16 -08003941 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08003942 xnn_params.f32.elu = (struct vunary_parameters) {
3943 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x64,
3944 .init.f32_elu = xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
3945 .element_tile = 64,
3946 };
Marat Dukhan48976702022-01-10 18:18:04 -08003947 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08003948 xnn_params.f32.elu = (struct vunary_parameters) {
3949 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56,
3950 .init.f32_elu = xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
3951 .element_tile = 56,
3952 };
Marat Dukhan48976702022-01-10 18:18:04 -08003953 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08003954 xnn_params.f32.elu = (struct vunary_parameters) {
3955 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32,
3956 .init.f32_elu = xnn_init_f32_elu_avx_rr2_lut4_p4_params,
3957 .element_tile = 32,
3958 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08003959 } else {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08003960 xnn_params.f32.elu = (struct vunary_parameters) {
3961 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x12,
3962 .init.f32_elu = xnn_init_f32_elu_sse2_rr2_lut16_p3_params,
3963 .element_tile = 12,
3964 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08003965 }
3966 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan561d0682021-12-23 16:12:35 -08003967 xnn_params.f32.hswish = (struct vunary_parameters) {
3968 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__avx512f_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08003969 .init.f32_hswish = xnn_init_f32_hswish_avx512_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08003970 .element_tile = 16,
3971 };
Marat Dukhan48976702022-01-10 18:18:04 -08003972 } else if (cpuinfo_has_x86_fma3()) {
Marat Dukhan561d0682021-12-23 16:12:35 -08003973 xnn_params.f32.hswish = (struct vunary_parameters) {
3974 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__fma3_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08003975 .init.f32_hswish = xnn_init_f32_hswish_avx_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08003976 .element_tile = 16,
3977 };
Marat Dukhan48976702022-01-10 18:18:04 -08003978 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan561d0682021-12-23 16:12:35 -08003979 xnn_params.f32.hswish = (struct vunary_parameters) {
3980 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__avx_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08003981 .init.f32_hswish = xnn_init_f32_hswish_avx_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08003982 .element_tile = 16,
3983 };
Marat Dukhan662faa02019-12-09 22:48:16 -08003984 } else {
Marat Dukhan561d0682021-12-23 16:12:35 -08003985 xnn_params.f32.hswish = (struct vunary_parameters) {
3986 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__sse_x8,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08003987 .init.f32_hswish = xnn_init_f32_hswish_sse_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08003988 .element_tile = 8,
3989 };
Marat Dukhan662faa02019-12-09 22:48:16 -08003990 }
Marat Dukhan5020b962020-06-08 13:30:10 -07003991 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan2894e992021-12-30 08:29:48 -08003992 xnn_params.f32.lrelu = (struct vunary_parameters) {
3993 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx512f_x16,
3994 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
3995 .element_tile = 16,
3996 };
Marat Dukhan48976702022-01-10 18:18:04 -08003997 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan2894e992021-12-30 08:29:48 -08003998 xnn_params.f32.lrelu = (struct vunary_parameters) {
3999 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx_x16,
4000 .init.f32_lrelu = xnn_init_f32_lrelu_avx_params,
4001 .element_tile = 16,
4002 };
Marat Dukhan0d3f4672020-06-25 16:42:58 -07004003 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan2894e992021-12-30 08:29:48 -08004004 xnn_params.f32.lrelu = (struct vunary_parameters) {
4005 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse41_x8,
4006 .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
4007 .element_tile = 8,
4008 };
Marat Dukhan28813332020-06-10 18:05:38 -07004009 } else {
Marat Dukhan2894e992021-12-30 08:29:48 -08004010 xnn_params.f32.lrelu = (struct vunary_parameters) {
4011 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse_x8,
4012 .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
4013 .element_tile = 8,
4014 };
Marat Dukhan28813332020-06-10 18:05:38 -07004015 }
4016 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08004017 xnn_params.f32.neg = (struct vunary_parameters) {
4018 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx512f_x16,
4019 .init.f32_neg = xnn_init_f32_neg_avx512_params,
4020 .element_tile = 16,
4021 };
Marat Dukhan48976702022-01-10 18:18:04 -08004022 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08004023 xnn_params.f32.neg = (struct vunary_parameters) {
4024 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx_x16,
4025 .init.f32_neg = xnn_init_f32_neg_avx_params,
4026 .element_tile = 16,
4027 };
Marat Dukhan5020b962020-06-08 13:30:10 -07004028 } else {
Marat Dukhane5efb162021-12-31 10:26:13 -08004029 xnn_params.f32.neg = (struct vunary_parameters) {
4030 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__sse_x8,
4031 .init.f32_neg = xnn_init_f32_neg_sse_params,
4032 .element_tile = 8,
4033 };
Marat Dukhan5020b962020-06-08 13:30:10 -07004034 }
Marat Dukhan64e52512020-06-09 13:41:16 -07004035 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan0e801372022-01-04 00:10:41 -08004036 xnn_params.f32.rndne = (struct vunary_parameters) {
4037 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx512f_x16,
4038 .element_tile = 16,
4039 };
4040 xnn_params.f32.rndz = (struct vunary_parameters) {
4041 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx512f_x16,
4042 .element_tile = 16,
4043 };
4044 xnn_params.f32.rndu = (struct vunary_parameters) {
4045 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx512f_x16,
4046 .element_tile = 16,
4047 };
4048 xnn_params.f32.rndd = (struct vunary_parameters) {
4049 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx512f_x16,
4050 .element_tile = 16,
4051 };
Marat Dukhan48976702022-01-10 18:18:04 -08004052 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan0e801372022-01-04 00:10:41 -08004053 xnn_params.f32.rndne = (struct vunary_parameters) {
4054 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx_x16,
4055 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4056 .element_tile = 16,
4057 };
4058 xnn_params.f32.rndz = (struct vunary_parameters) {
4059 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx_x16,
4060 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4061 .element_tile = 16,
4062 };
4063 xnn_params.f32.rndu = (struct vunary_parameters) {
4064 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx_x16,
4065 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4066 .element_tile = 16,
4067 };
4068 xnn_params.f32.rndd = (struct vunary_parameters) {
4069 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx_x16,
4070 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4071 .element_tile = 16,
4072 };
Marat Dukhan64e52512020-06-09 13:41:16 -07004073 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan0e801372022-01-04 00:10:41 -08004074 xnn_params.f32.rndne = (struct vunary_parameters) {
4075 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse41_x8,
4076 .element_tile = 8,
4077 };
4078 xnn_params.f32.rndz = (struct vunary_parameters) {
4079 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse41_x8,
4080 .element_tile = 8,
4081 };
4082 xnn_params.f32.rndu = (struct vunary_parameters) {
4083 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse41_x8,
4084 .element_tile = 8,
4085 };
4086 xnn_params.f32.rndd = (struct vunary_parameters) {
4087 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse41_x8,
4088 .element_tile = 8,
4089 };
Marat Dukhan64e52512020-06-09 13:41:16 -07004090 } else {
Marat Dukhan0e801372022-01-04 00:10:41 -08004091 xnn_params.f32.rndne = (struct vunary_parameters) {
4092 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse2_x8,
4093 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4094 .element_tile = 8,
4095 };
4096 xnn_params.f32.rndz = (struct vunary_parameters) {
4097 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse2_x8,
4098 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4099 .element_tile = 8,
4100 };
4101 xnn_params.f32.rndu = (struct vunary_parameters) {
4102 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse2_x8,
4103 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4104 .element_tile = 8,
4105 };
4106 xnn_params.f32.rndd = (struct vunary_parameters) {
4107 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse2_x8,
4108 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4109 .element_tile = 8,
4110 };
Marat Dukhan64e52512020-06-09 13:41:16 -07004111 }
Marat Dukhand9ca7e62020-09-23 23:45:29 -07004112 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhance834ad2022-01-03 00:22:01 -08004113 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4114 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_x64,
4115 .init.f32_sigmoid = xnn_init_f32_sigmoid_avx512_rr2_lut32_p2_params,
4116 .element_tile = 64,
4117 };
Marat Dukhan48976702022-01-10 18:18:04 -08004118 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhance834ad2022-01-03 00:22:01 -08004119 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4120 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_x40,
4121 .init.f32_sigmoid = xnn_init_f32_sigmoid_avx2_rr1_p5_params,
4122 .element_tile = 40,
4123 };
Marat Dukhan48976702022-01-10 18:18:04 -08004124 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhance834ad2022-01-03 00:22:01 -08004125 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4126 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_x40,
4127 .init.f32_sigmoid = xnn_init_f32_sigmoid_avx_rr2_p5_params,
4128 .element_tile = 40,
4129 };
Marat Dukhan6dd71362020-09-17 23:11:21 -07004130 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhance834ad2022-01-03 00:22:01 -08004131 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4132 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_x8,
4133 .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
4134 .element_tile = 8,
4135 };
Marat Dukhanfa0a4322020-01-06 16:14:29 -08004136 } else {
Marat Dukhance834ad2022-01-03 00:22:01 -08004137 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4138 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_x8,
4139 .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
4140 .element_tile = 8,
4141 };
Marat Dukhanfa0a4322020-01-06 16:14:29 -08004142 }
Marat Dukhan90eca0a2020-03-11 00:52:23 -07004143 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08004144 xnn_params.f32.sqr = (struct vunary_parameters) {
4145 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx512f_x16,
4146 .element_tile = 16,
4147 };
Marat Dukhan48976702022-01-10 18:18:04 -08004148 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhane5efb162021-12-31 10:26:13 -08004149 xnn_params.f32.sqr = (struct vunary_parameters) {
4150 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx_x16,
4151 .init.f32_default = xnn_init_f32_default_avx_params,
4152 .element_tile = 16,
4153 };
Marat Dukhan5020b962020-06-08 13:30:10 -07004154 } else {
Marat Dukhane5efb162021-12-31 10:26:13 -08004155 xnn_params.f32.sqr = (struct vunary_parameters) {
4156 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__sse_x8,
4157 .element_tile = 8,
4158 };
Marat Dukhan5020b962020-06-08 13:30:10 -07004159 }
Marat Dukhan48976702022-01-10 18:18:04 -08004160 if (cpuinfo_has_x86_avx()) {
Marat Dukhane72b2822021-12-30 14:46:58 -08004161 xnn_params.f32.sqrt = (struct vunary_parameters) {
4162 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__avx_sqrt_x8,
4163 .init.f32_sqrt = xnn_init_f32_sqrt_avx_params,
4164 .element_tile = 8,
4165 };
Marat Dukhan6804bbd2020-06-30 19:26:11 -07004166 } else {
Marat Dukhane72b2822021-12-30 14:46:58 -08004167 xnn_params.f32.sqrt = (struct vunary_parameters) {
4168 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__sse_sqrt_x4,
4169 .element_tile = 4,
4170 };
Marat Dukhan6804bbd2020-06-30 19:26:11 -07004171 }
Marat Dukhan5020b962020-06-08 13:30:10 -07004172 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
Marat Dukhan90eca0a2020-03-11 00:52:23 -07004173 xnn_params.f32.prelu = (struct prelu_parameters) {
4174 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx512f_2x16,
4175 .row_tile = 2,
4176 .channel_tile = 16,
4177 };
Marat Dukhan48976702022-01-10 18:18:04 -08004178 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan90eca0a2020-03-11 00:52:23 -07004179 xnn_params.f32.prelu = (struct prelu_parameters) {
4180 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx_2x16,
4181 .row_tile = 2,
4182 .channel_tile = 16,
4183 };
Marat Dukhan39b5e942020-06-24 15:03:48 -07004184 } else if (cpuinfo_has_x86_sse4_1()) {
4185 xnn_params.f32.prelu = (struct prelu_parameters) {
4186 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse41_2x8,
4187 .row_tile = 2,
4188 .channel_tile = 8,
4189 };
Marat Dukhan90eca0a2020-03-11 00:52:23 -07004190 } else {
4191 xnn_params.f32.prelu = (struct prelu_parameters) {
4192 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse2_2x8,
4193 .row_tile = 2,
4194 .channel_tile = 8,
4195 };
4196 }
Marat Dukhan4a5c7712022-01-05 22:43:13 -08004197 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
4198 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x20_acc2,
4199 .init = xnn_init_f32_expminus_sse2_rr2_p5_params,
4200 .element_tile = 20,
4201 };
Marat Dukhan1edc4542020-01-27 12:40:13 -08004202 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__sse;
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004203 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4204 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004205 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx512f_x32,
4206 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
4207 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
Marat Dukhanf6004972021-12-30 11:23:02 -08004208 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004209 .element_tile = 32,
4210 };
4211 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004212 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx512f_x32,
4213 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx512f_x32,
4214 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx512f_x32,
Marat Dukhanf6004972021-12-30 11:23:02 -08004215 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004216 .element_tile = 32,
4217 };
4218 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004219 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx512f_x32,
4220 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
4221 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004222 .element_tile = 32,
4223 };
4224 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004225 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx512f_x32,
4226 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
4227 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004228 .element_tile = 32,
4229 };
4230 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004231 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx512f_x32,
4232 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
4233 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
Marat Dukhanf6004972021-12-30 11:23:02 -08004234 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004235 .element_tile = 32,
4236 };
4237 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004238 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx512f_x32,
4239 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx512f_x32,
4240 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx512f_x32,
Marat Dukhanf6004972021-12-30 11:23:02 -08004241 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004242 .element_tile = 32,
4243 };
Marat Dukhanf7399262020-06-05 10:58:44 -07004244 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004245 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx512f_x32,
4246 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
4247 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
Marat Dukhanf7399262020-06-05 10:58:44 -07004248 .element_tile = 32,
4249 };
Marat Dukhan48976702022-01-10 18:18:04 -08004250 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004251 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004252 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx_x16,
4253 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
4254 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004255 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004256 .element_tile = 16,
4257 };
4258 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004259 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx_x16,
4260 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx_x16,
4261 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004262 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004263 .element_tile = 16,
4264 };
4265 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004266 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx_x16,
4267 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
4268 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
Marat Dukhan98c52152021-12-30 13:31:00 -08004269 .init.f32_default = xnn_init_f32_default_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004270 .element_tile = 16,
4271 };
4272 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004273 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx_x16,
4274 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
4275 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
Marat Dukhan98c52152021-12-30 13:31:00 -08004276 .init.f32_default = xnn_init_f32_default_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004277 .element_tile = 16,
4278 };
4279 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004280 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx_x16,
4281 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
4282 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004283 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004284 .element_tile = 16,
4285 };
4286 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004287 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx_x16,
4288 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx_x16,
4289 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08004290 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004291 .element_tile = 16,
4292 };
Marat Dukhanf7399262020-06-05 10:58:44 -07004293 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004294 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx_x16,
4295 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
4296 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
Marat Dukhan98c52152021-12-30 13:31:00 -08004297 .init.f32_default = xnn_init_f32_default_avx_params,
Marat Dukhanf7399262020-06-05 10:58:44 -07004298 .element_tile = 16,
4299 };
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004300 } else {
4301 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004302 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__sse_x8,
4303 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
4304 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08004305 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004306 .element_tile = 8,
4307 };
4308 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004309 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__sse_x8,
4310 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__sse_x8,
4311 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__sse_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08004312 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004313 .element_tile = 8,
4314 };
4315 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004316 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__sse_x8,
4317 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
4318 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004319 .element_tile = 8,
4320 };
4321 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004322 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__sse_x8,
4323 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
4324 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004325 .element_tile = 8,
4326 };
4327 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004328 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__sse_x8,
4329 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
4330 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08004331 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004332 .element_tile = 8,
4333 };
4334 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004335 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__sse_x8,
4336 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__sse_x8,
4337 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__sse_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08004338 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004339 .element_tile = 8,
4340 };
Marat Dukhanf7399262020-06-05 10:58:44 -07004341 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchardc67dd7f2020-07-06 11:23:57 -07004342 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__sse_x8,
4343 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
4344 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07004345 .element_tile = 8,
4346 };
Marat Dukhan9a88efe2019-12-10 15:54:24 -08004347 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004348 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07004349 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07004350 .init.f32 = xnn_init_f32_minmax_sse_params,
Marat Dukhan49e6ee92019-11-06 15:55:29 -08004351 .channel_tile = 4,
4352 .row_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004353 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08004354 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08004355 // Sparse microkernels on x86 currently target only SSE, and on processors
4356 // with AVX ISA dense inference is expected to be faster than sparse.
4357 if (!cpuinfo_has_x86_avx()) {
4358 init_flags |= XNN_INIT_FLAG_CHW_OPT;
4359 }
4360
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004361 xnn_params.f32.spmm = (struct spmm_parameters) {
Frank Barchard4fd38b22020-10-30 17:10:11 -07004362 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__sse,
4363 .mr = 32,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004364 .nr = 1,
4365 };
Erich Elsen5b2e07a2020-06-09 03:27:59 -07004366 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
4367 .ukernel_with_symm_padding =
4368 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2,
4369 .output_channel_tile = 4,
4370 .output_height_tile = 2,
4371 .output_width_tile = 2,
4372 };
Marat Dukhan48976702022-01-10 18:18:04 -08004373 if (cpuinfo_has_x86_ssse3()) {
Frank Barchard0b18cb32020-11-23 10:50:44 -08004374 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
4375 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2,
Frank Barchard0b18cb32020-11-23 10:50:44 -08004376 .output_width_tile = 4,
4377 .output_height_tile = 2,
4378 };
4379 } else {
4380 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
4381 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2,
Frank Barchard0b18cb32020-11-23 10:50:44 -08004382 .output_width_tile = 4,
4383 .output_height_tile = 2,
4384 };
4385 }
Marat Dukhanbf715f92020-10-23 20:17:00 -07004386 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
4387 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004388 .output_width_tile = 4,
4389 .output_height_tile = 1,
4390 };
Marat Dukhand0503892020-10-30 08:22:04 -07004391 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
4392 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4,
Marat Dukhand0503892020-10-30 08:22:04 -07004393 .output_width_tile = 4,
4394 .output_height_tile = 4,
4395 };
Marat Dukhanccca2142020-10-30 17:32:45 -07004396 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
4397 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4,
Marat Dukhanccca2142020-10-30 17:32:45 -07004398 .output_width_tile = 4,
4399 .output_height_tile = 2,
4400 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07004401 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
4402 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__sse_x4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004403 .channel_tile = 4,
4404 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07004405 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07004406 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__sse_p8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07004407 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07004408 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07004409 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08004410 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004411 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004412
Frank Barchardb40ee632021-12-30 11:10:02 -08004413 /*************************** VCVT x86 micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004414 #ifndef XNN_NO_VCVT_OPERATORS
4415 init_flags |= XNN_INIT_FLAG_VCVT;
4416
4417 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
Marat Dukhan134f9842021-12-29 19:57:31 -08004418 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4419 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx512skx_x16,
4420 .element_tile = 16,
4421 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004422 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4423 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx512skx_x16,
4424 .element_tile = 16,
4425 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004426 } else if (cpuinfo_has_x86_f16c()) {
Marat Dukhan134f9842021-12-29 19:57:31 -08004427 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4428 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__f16c_x16,
4429 .element_tile = 16,
4430 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004431 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4432 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__f16c_x16,
4433 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_f16c_params,
4434 .element_tile = 16,
4435 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004436 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhan134f9842021-12-29 19:57:31 -08004437 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4438 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx_int16_x16,
4439 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
4440 .element_tile = 16,
4441 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004442 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4443 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx_x24,
4444 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
4445 .element_tile = 24,
4446 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004447 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhan134f9842021-12-29 19:57:31 -08004448 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4449 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse41_int16_x16,
4450 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
4451 .element_tile = 16,
4452 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004453 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4454 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse41_x8,
4455 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
4456 .element_tile = 8,
4457 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004458 } else {
Marat Dukhan134f9842021-12-29 19:57:31 -08004459 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4460 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse2_int16_x32,
4461 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
4462 .element_tile = 32,
4463 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08004464 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4465 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse2_x16,
4466 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
4467 .element_tile = 16,
4468 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004469 }
Marat Dukhan2edf8632021-12-14 23:17:14 -08004470 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
4471 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4472 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx512skx_x128,
4473 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx512_params,
4474 .element_tile = 128,
4475 };
4476 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan0d399ca2021-12-14 19:25:50 -08004477 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4478 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx2_x64,
4479 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx2_params,
4480 .element_tile = 64,
4481 };
4482 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhanb91432c2021-12-14 16:52:09 -08004483 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4484 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx_x32,
4485 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx_params,
4486 .element_tile = 32,
4487 };
4488 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhaned2d7762021-12-03 23:51:19 -08004489 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4490 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse41_x32,
4491 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse4_params,
4492 .element_tile = 32,
4493 };
4494 } else {
4495 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4496 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse2_x32,
4497 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse2_params,
4498 .element_tile = 32,
4499 };
4500 }
Marat Dukhan2edf8632021-12-14 23:17:14 -08004501 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
4502 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4503 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx512skx_x128,
4504 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx512_params,
4505 .element_tile = 128,
4506 };
4507 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan0d399ca2021-12-14 19:25:50 -08004508 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4509 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx2_x64,
4510 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx2_params,
4511 .element_tile = 64,
4512 };
4513 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhanb91432c2021-12-14 16:52:09 -08004514 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4515 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx_x32,
4516 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx_params,
4517 .element_tile = 32,
4518 };
4519 } else {
4520 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4521 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__sse2_x32,
4522 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_sse2_params,
4523 .element_tile = 32,
4524 };
4525 }
Marat Dukhan98393ad2021-12-15 11:07:40 -08004526 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
4527 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4528 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx512skx_x32,
4529 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx512_params,
4530 .element_tile = 32,
4531 };
4532 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4533 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx512skx_x32,
4534 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx512_params,
4535 .element_tile = 32,
4536 };
4537 } else if (cpuinfo_has_x86_avx2()) {
Marat Dukhan7b5f7792021-12-15 00:29:39 -08004538 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4539 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx2_x16,
4540 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
4541 .element_tile = 16,
4542 };
4543 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4544 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx2_x16,
4545 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
4546 .element_tile = 16,
4547 };
4548 } else if (cpuinfo_has_x86_avx()) {
Marat Dukhancd4089f2021-12-14 23:53:33 -08004549 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4550 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx_x32,
4551 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
4552 .element_tile = 32,
4553 };
4554 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4555 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx_x32,
4556 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
4557 .element_tile = 32,
4558 };
4559 } else if (cpuinfo_has_x86_sse4_1()) {
Marat Dukhanf92206b2021-12-10 17:02:07 -08004560 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4561 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse41_x16,
4562 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse4_params,
4563 .element_tile = 16,
4564 };
4565 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4566 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__sse41_x16,
4567 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse4_params,
4568 .element_tile = 16,
4569 };
4570 } else {
4571 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4572 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse2_x32,
4573 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse2_params,
4574 .element_tile = 32,
4575 };
4576 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4577 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__sse2_x32,
4578 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse2_params,
4579 .element_tile = 32,
4580 };
4581 }
Marat Dukhanaf2ba002021-10-24 14:21:41 -07004582 #endif // XNN_NO_VCVT_OPERATORS
4583
Frank Barchardb40ee632021-12-30 11:10:02 -08004584 /**************************** X32 x86 micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004585 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004586 init_flags |= XNN_INIT_FLAG_X32;
4587
Marat Dukhan57dccd82020-04-14 00:53:10 -07004588 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__sse2;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004589 xnn_params.x32.zip = (struct zip_parameters) {
4590 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__sse2,
4591 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__sse2,
4592 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__sse2,
4593 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__sse2,
4594 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08004595 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08004596 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
4597 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08004598 .channel_tile = 1,
4599 .pixel_tile = 1,
4600 };
4601 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004602 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004603
Frank Barchardb40ee632021-12-30 11:10:02 -08004604 /**************************** XX x86 micro-kernels ****************************/
Marat Dukhan048931b2020-11-24 20:53:54 -08004605 #ifndef XNN_NO_XX_OPERATORS
4606 init_flags |= XNN_INIT_FLAG_XX;
4607
4608 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
Marat Dukhan933051b2021-08-07 16:26:15 -07004609 xnn_params.xx.fill = (struct fill_parameters) {
4610 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__sse2_x64,
4611 .row_tile = 1,
4612 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07004613 xnn_params.xx.pad = (struct pad_parameters) {
4614 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__sse2,
4615 .row_tile = 1,
4616 };
Marat Dukhan048931b2020-11-24 20:53:54 -08004617 #endif
4618
Marat Dukhan4c617792021-12-21 15:47:58 -08004619#elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhan933051b2021-08-07 16:26:15 -07004620
Frank Barchardb40ee632021-12-30 11:10:02 -08004621 /**************************** QC8 WAsm SIMD micro-kernels ****************************/
Marat Dukhan898d5852021-06-30 21:18:34 -07004622 #ifndef XNN_NO_QS8_OPERATORS
4623 init_flags |= XNN_INIT_FLAG_QC8;
4624
Marat Dukhan189c1d02021-09-03 15:39:54 -07004625 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
Marat Dukhan58cdcf22022-02-01 02:05:00 -08004626 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4627 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4628 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4629 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
Marat Dukhan189c1d02021-09-03 15:39:54 -07004630 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
Marat Dukhan0f1ed942021-12-08 23:25:50 -08004631 xnn_params.qc8.gemm.mr = 4;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004632 xnn_params.qc8.gemm.nr = 4;
4633 xnn_params.qc8.gemm.log2_kr = 1;
Marat Dukhan0f1ed942021-12-08 23:25:50 -08004634 xnn_params.qc8.gemm.log2_sr = 2;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004635 #else
4636 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4637 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4638 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4639 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4640 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
4641 xnn_params.qc8.gemm.mr = 3;
4642 xnn_params.qc8.gemm.nr = 4;
4643 xnn_params.qc8.gemm.log2_kr = 3;
4644 #endif
Marat Dukhan898d5852021-06-30 21:18:34 -07004645
Marat Dukhan9cedb592021-08-17 17:25:24 -07004646 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
Marat Dukhan898d5852021-06-30 21:18:34 -07004647 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004648 xnn_params.qc8.dwconv[0].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07004649 xnn_params.qc8.dwconv[0].primary_tile = 9;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004650 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
Marat Dukhan898d5852021-06-30 21:18:34 -07004651 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004652 xnn_params.qc8.dwconv[1].channel_tile = 16;
Marat Dukhan898d5852021-06-30 21:18:34 -07004653 xnn_params.qc8.dwconv[1].primary_tile = 25;
4654 #endif // XNN_NO_QC8_OPERATORS
4655
Frank Barchardb40ee632021-12-30 11:10:02 -08004656 /**************************** QS8 WAsm SIMD micro-kernels ****************************/
Marat Dukhan07e50402020-08-05 17:16:53 -07004657 #ifndef XNN_NO_QS8_OPERATORS
4658 init_flags |= XNN_INIT_FLAG_QS8;
4659
Marat Dukhan189c1d02021-09-03 15:39:54 -07004660 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
Marat Dukhan58cdcf22022-02-01 02:05:00 -08004661 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4662 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4663 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4664 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
Marat Dukhan189c1d02021-09-03 15:39:54 -07004665 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
Marat Dukhan0f1ed942021-12-08 23:25:50 -08004666 xnn_params.qs8.gemm.mr = 4;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004667 xnn_params.qs8.gemm.nr = 4;
4668 xnn_params.qs8.gemm.log2_kr = 1;
Marat Dukhan0f1ed942021-12-08 23:25:50 -08004669 xnn_params.qs8.gemm.log2_sr = 2;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004670 #else // XNN_WASMSIMD_VERSION >= 88
4671 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4672 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4673 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4674 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4675 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
4676 xnn_params.qs8.gemm.mr = 3;
4677 xnn_params.qs8.gemm.nr = 4;
4678 xnn_params.qs8.gemm.log2_kr = 3;
4679 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhan07e50402020-08-05 17:16:53 -07004680
Marat Dukhan9cedb592021-08-17 17:25:24 -07004681 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
Marat Dukhan400e7cb2021-08-07 15:14:54 -07004682 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004683 xnn_params.qs8.dwconv[0].channel_tile = 16;
Marat Dukhan07e50402020-08-05 17:16:53 -07004684 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004685 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
Marat Dukhan400e7cb2021-08-07 15:14:54 -07004686 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
Marat Dukhan9cedb592021-08-17 17:25:24 -07004687 xnn_params.qs8.dwconv[1].channel_tile = 16;
Marat Dukhan4ed14882021-05-12 17:50:40 -07004688 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhan9e0b5392020-08-07 02:29:34 -07004689
4690 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan9e258d62022-01-12 10:50:51 -08004691 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
4692 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
Marat Dukhan53f41062022-01-11 19:44:57 -08004693 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params,
4694 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_wasmsimd_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004695 .row_tile = 7,
Marat Dukhan9e258d62022-01-12 10:50:51 -08004696 .channel_tile = 16,
Marat Dukhan9e0b5392020-08-07 02:29:34 -07004697 };
Marat Dukhanff209482020-09-03 14:26:53 -07004698
4699 xnn_params.qs8.vadd = (struct vbinary_parameters) {
Marat Dukhane20a8732021-12-07 17:11:37 -08004700 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__wasmsimd_x32,
4701 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
4702 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07004703 .init.qs8_addsub = xnn_init_qs8_add_minmax_wasmsimd_params,
Marat Dukhane20a8732021-12-07 17:11:37 -08004704 .element_tile = 32,
Marat Dukhanff209482020-09-03 14:26:53 -07004705 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07004706 xnn_params.qs8.vmul = (struct vbinary_parameters) {
4707 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4708 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4709 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4710 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_wasmsimd_params,
4711 .element_tile = 8,
4712 };
Marat Dukhan07e50402020-08-05 17:16:53 -07004713 #endif // XNN_NO_QS8_OPERATORS
4714
Frank Barchardb40ee632021-12-30 11:10:02 -08004715 /**************************** QU8 WAsm SIMD micro-kernels ****************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -07004716 #ifndef XNN_NO_QU8_OPERATORS
4717 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004718
Marat Dukhan189c1d02021-09-03 15:39:54 -07004719 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
Marat Dukhan58cdcf22022-02-01 02:05:00 -08004720 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4721 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4722 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4723 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
Marat Dukhan189c1d02021-09-03 15:39:54 -07004724 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
Marat Dukhan58cdcf22022-02-01 02:05:00 -08004725 xnn_params.qu8.gemm.mr = 4;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004726 xnn_params.qu8.gemm.nr = 4;
4727 xnn_params.qu8.gemm.log2_kr = 1;
Marat Dukhan58cdcf22022-02-01 02:05:00 -08004728 xnn_params.qu8.gemm.log2_sr = 2;
Marat Dukhan189c1d02021-09-03 15:39:54 -07004729 #else // XNN_WASMSIMD_VERSION >= 88
4730 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64);
4731 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64);
4732 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64);
4733 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64);
4734 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
4735 xnn_params.qu8.gemm.mr = 3;
4736 xnn_params.qu8.gemm.nr = 4;
4737 xnn_params.qu8.gemm.log2_kr = 3;
4738 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhanaefaef32020-04-09 07:09:34 -07004739
Marat Dukhana97e9752021-07-15 16:30:41 -07004740 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16;
4741 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
4742 xnn_params.qu8.dwconv[0].channel_tile = 8;
Marat Dukhan08b7a972020-07-14 18:17:29 -07004743 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhana97e9752021-07-15 16:30:41 -07004744 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__wasmsimd_mul16;
4745 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
4746 xnn_params.qu8.dwconv[1].channel_tile = 8;
4747 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004748
Marat Dukhan08b7a972020-07-14 18:17:29 -07004749 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004750 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
4751 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
4752 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
4753 .primary_tile = 9,
4754 .incremental_tile = 8,
4755 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004756 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07004757 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhand1f53e42022-01-12 22:34:51 -08004758 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
4759 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
4760 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params,
4761 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004762 .row_tile = 7,
Marat Dukhand1f53e42022-01-12 22:34:51 -08004763 .channel_tile = 16,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004764 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07004765
4766 xnn_params.qu8.vadd = (struct vbinary_parameters) {
Marat Dukhane20a8732021-12-07 17:11:37 -08004767 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__wasmsimd_x32,
4768 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
4769 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
Marat Dukhan64287252021-09-07 16:20:03 -07004770 .init.qu8_addsub = xnn_init_qu8_add_minmax_wasmsimd_params,
Marat Dukhane20a8732021-12-07 17:11:37 -08004771 .element_tile = 32,
Marat Dukhandb007cd2021-07-20 23:42:39 -07004772 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07004773 xnn_params.qu8.vmul = (struct vbinary_parameters) {
4774 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4775 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4776 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4777 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_wasmsimd_params,
4778 .element_tile = 8,
4779 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07004780 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004781
Frank Barchardb40ee632021-12-30 11:10:02 -08004782 /**************************** S8 WAsm SIMD micro-kernels****************************/
Marat Dukhandc5c1482021-08-16 09:03:15 -07004783 #ifndef XNN_NO_S8_OPERATORS
4784 init_flags |= XNN_INIT_FLAG_S8;
4785
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07004786 xnn_params.s8.clamp = (struct vunary_parameters) {
4787 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__wasmsimd_x64,
4788 .init.s8_minmax = xnn_init_s8_minmax_wasmsimd_params,
4789 .element_tile = 64,
4790 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08004791 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
4792 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
4793 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
4794 .pixel_tile = 1,
4795 .channel_tile = 8,
4796 };
4797 #else // XNN_WASMSIMD_VERSION >= 88
4798 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
4799 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__wasmsimd_mul32_c8,
4800 .pixel_tile = 1,
4801 .channel_tile = 8,
4802 };
4803 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhandc5c1482021-08-16 09:03:15 -07004804 xnn_params.s8.maxpool = (struct maxpool_parameters) {
4805 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
4806 .init.s8 = xnn_init_s8_minmax_wasmsimd_params,
4807 .mr = 9,
4808 .qr = 8,
4809 };
4810 #endif // XNN_NO_S8_OPERATORS
4811
Frank Barchardb40ee632021-12-30 11:10:02 -08004812 /**************************** U8 WAsm SIMD micro-kernels****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004813 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004814 init_flags |= XNN_INIT_FLAG_U8;
4815
Marat Dukhan94912792021-08-16 21:40:30 -07004816 xnn_params.u8.clamp = (struct vunary_parameters) {
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07004817 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__wasmsimd_x64,
4818 .init.u8_minmax = xnn_init_u8_minmax_wasmsimd_params,
4819 .element_tile = 64,
Marat Dukhan94912792021-08-16 21:40:30 -07004820 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08004821 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
4822 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
4823 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
4824 .pixel_tile = 1,
4825 .channel_tile = 8,
4826 };
4827 #else // XNN_WASMSIMD_VERSION >= 88
4828 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
4829 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__wasmsimd_mul32_c8,
4830 .pixel_tile = 1,
4831 .channel_tile = 8,
4832 };
4833 #endif // XNN_WASMSIMD_VERSION >= 88
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004834 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhanf1589422021-08-15 20:37:06 -07004835 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
4836 .init.u8 = xnn_init_u8_minmax_wasmsimd_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004837 .mr = 9,
4838 .qr = 8,
4839 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004840 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
4841 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
4842 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004843
Frank Barchardb40ee632021-12-30 11:10:02 -08004844 /**************************** X8 WAsm SIMD micro-kernels****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004845 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004846 init_flags |= XNN_INIT_FLAG_X8;
4847
Marat Dukhand67539d2021-09-08 23:06:03 -07004848 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004849 xnn_params.x8.zip = (struct zip_parameters) {
4850 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
4851 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
4852 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
4853 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
4854 };
4855 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07004856
Frank Barchardb40ee632021-12-30 11:10:02 -08004857 /**************************** F32 WAsm SIMD micro-kernels****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004858 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07004859 init_flags |= XNN_INIT_FLAG_F32;
4860
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004861 if (is_wasm_x86) {
Frank Barchard0725b8d2020-12-07 11:07:35 -08004862 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
4863 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
4864 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
4865 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
Marat Dukhan688f6d82020-07-14 17:02:11 -07004866 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat);
4867 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat);
4868 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
4869 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
Marat Dukhan802808c2020-06-16 11:01:17 -07004870 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__wasmsimd_splat);
4871 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__wasmsimd_splat);
4872 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
4873 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004874 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004875 xnn_params.f32.gemm.mr = 4;
4876 xnn_params.f32.gemm.nr = 8;
Marat Dukhane39e6462020-07-09 01:33:36 -07004877
4878 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
4879 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
4880 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
4881 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004882 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhane39e6462020-07-09 01:33:36 -07004883 xnn_params.f32.gemm2.mr = 4;
4884 xnn_params.f32.gemm2.nr = 2;
4885 xnn_params.f32.gemm2.log2_kr = 2;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004886 } else {
Frank Barchard0725b8d2020-12-07 11:07:35 -08004887 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
4888 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
4889 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
4890 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
Marat Dukhan688f6d82020-07-14 17:02:11 -07004891 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat);
4892 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat);
4893 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
4894 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
Marat Dukhan802808c2020-06-16 11:01:17 -07004895 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_5x8__wasmsimd_splat);
4896 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_5x8__wasmsimd_splat);
4897 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
4898 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004899 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhan1bbf96b2020-06-15 23:01:20 -07004900 xnn_params.f32.gemm.mr = 5;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004901 xnn_params.f32.gemm.nr = 8;
Marat Dukhane39e6462020-07-09 01:33:36 -07004902
4903 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
4904 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
4905 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
4906 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004907 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhane39e6462020-07-09 01:33:36 -07004908 xnn_params.f32.gemm2.mr = 4;
4909 xnn_params.f32.gemm2.nr = 2;
4910 xnn_params.f32.gemm2.log2_kr = 2;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07004911 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07004912
Marat Dukhanac014d72020-06-16 08:36:47 -07004913 if (is_wasm_x86) {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004914 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__wasmsimd_x86;
4915 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x3__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004916 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07004917 xnn_params.f32.dwconv[0].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004918 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004919
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004920 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86;
4921 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x4__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004922 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07004923 xnn_params.f32.dwconv[1].channel_tile = 8;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004924 xnn_params.f32.dwconv[1].primary_tile = 4;
4925
4926 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__wasmsimd_x86;
4927 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004928 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004929 xnn_params.f32.dwconv[2].channel_tile = 8;
4930 xnn_params.f32.dwconv[2].primary_tile = 9;
Marat Dukhanac014d72020-06-16 08:36:47 -07004931 } else {
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004932 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x3__wasmsimd_arm;
4933 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x3__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004934 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07004935 xnn_params.f32.dwconv[0].channel_tile = 4;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004936 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004937
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004938 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_arm;
4939 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004940 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Marat Dukhanac014d72020-06-16 08:36:47 -07004941 xnn_params.f32.dwconv[1].channel_tile = 4;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004942 xnn_params.f32.dwconv[1].primary_tile = 4;
4943
4944 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__wasmsimd_arm;
4945 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004946 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004947 xnn_params.f32.dwconv[2].channel_tile = 4;
4948 xnn_params.f32.dwconv[2].primary_tile = 9;
Marat Dukhanac014d72020-06-16 08:36:47 -07004949 }
4950
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004951 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x25__wasmsimd_arm;
4952 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__wasmsimd;
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08004953 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07004954 xnn_params.f32.dwconv[3].channel_tile = 4;
4955 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07004956
Marat Dukhan3b7432d2020-07-16 17:46:32 -07004957 if (is_wasm_x86) {
4958 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004959 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
4960 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08004961 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004962 .primary_tile = 9,
4963 .incremental_tile = 8,
4964 .channel_tile = 4,
Marat Dukhan3b7432d2020-07-16 17:46:32 -07004965 };
Marat Dukhan1483c532020-07-16 18:08:19 -07004966 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004967 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
4968 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
4969 .primary_tile = 9,
4970 .incremental_tile = 8,
4971 .channel_tile = 4,
Marat Dukhan1483c532020-07-16 18:08:19 -07004972 };
Marat Dukhanc6016802020-07-16 18:51:28 -07004973 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004974 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4,
4975 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08004976 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
4977 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004978 .row_tile = 7,
4979 .channel_tile = 4,
Marat Dukhanc6016802020-07-16 18:51:28 -07004980 };
Marat Dukhan3b7432d2020-07-16 17:46:32 -07004981 } else {
4982 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004983 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
4984 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08004985 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08004986 .primary_tile = 9,
4987 .incremental_tile = 8,
4988 .channel_tile = 4,
Marat Dukhan3b7432d2020-07-16 17:46:32 -07004989 };
Marat Dukhan1483c532020-07-16 18:08:19 -07004990 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004991 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
4992 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
4993 .primary_tile = 9,
4994 .incremental_tile = 8,
4995 .channel_tile = 4,
Marat Dukhan1483c532020-07-16 18:08:19 -07004996 };
Marat Dukhanc6016802020-07-16 18:51:28 -07004997 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08004998 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4,
4999 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4,
Marat Dukhan84598222022-01-09 21:39:22 -08005000 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5001 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08005002 .row_tile = 7,
5003 .channel_tile = 4,
Marat Dukhanc6016802020-07-16 18:51:28 -07005004 };
Marat Dukhan3b7432d2020-07-16 17:46:32 -07005005 }
Marat Dukhanf6e24802020-07-08 22:20:40 -07005006 if (is_wasm_x86) {
5007 xnn_params.f32.maxpool = (struct maxpool_parameters) {
5008 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08005009 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhanf6e24802020-07-08 22:20:40 -07005010 .mr = 9,
5011 .qr = 8,
5012 };
5013 } else {
5014 xnn_params.f32.maxpool = (struct maxpool_parameters) {
5015 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08005016 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhanf6e24802020-07-08 22:20:40 -07005017 .mr = 9,
5018 .qr = 8,
5019 };
5020 }
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005021 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhan40f05522020-07-16 22:33:12 -07005022 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__wasmsimd_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005023 .mr = 4,
5024 };
5025 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhan40f05522020-07-16 22:33:12 -07005026 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__wasmsimd_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005027 .mr = 9,
5028 };
5029 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhan40f05522020-07-16 22:33:12 -07005030 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__wasmsimd_c4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005031 .mr = 9,
5032 .qr = 8,
5033 };
Marat Dukhan660fd192020-03-10 04:55:30 -07005034 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
Marat Dukhan00d1d6e2020-07-09 01:37:27 -07005035 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__wasmsimd_c8,
Marat Dukhan69722492019-11-11 19:55:50 -08005036 .pixel_tile = 1,
5037 .channel_tile = 8,
5038 };
Marat Dukhane5efb162021-12-31 10:26:13 -08005039 xnn_params.f32.abs = (struct vunary_parameters) {
5040 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__wasmsimd_x8,
5041 .init.f32_abs = xnn_init_f32_abs_wasmsimd_params,
5042 .element_tile = 16,
5043 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005044 if (is_wasm_x86) {
Marat Dukhan94912792021-08-16 21:40:30 -07005045 xnn_params.f32.clamp = (struct vunary_parameters) {
5046 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_x86_x8,
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08005047 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhan94912792021-08-16 21:40:30 -07005048 .element_tile = 8,
5049 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005050 } else {
Marat Dukhan94912792021-08-16 21:40:30 -07005051 xnn_params.f32.clamp = (struct vunary_parameters) {
5052 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_arm_x8,
Marat Dukhanc83ef3b2021-12-30 09:47:07 -08005053 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhan94912792021-08-16 21:40:30 -07005054 .element_tile = 8,
5055 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005056 }
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005057 if (is_wasm_x86) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08005058 xnn_params.f32.elu = (struct vunary_parameters) {
5059 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20,
5060 .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
5061 .element_tile = 20,
5062 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005063 } else {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08005064 xnn_params.f32.elu = (struct vunary_parameters) {
5065 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20,
5066 .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
5067 .element_tile = 20,
5068 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005069 }
Marat Dukhan561d0682021-12-23 16:12:35 -08005070 xnn_params.f32.hswish = (struct vunary_parameters) {
5071 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__wasmsimd_x16,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08005072 .init.f32_hswish = xnn_init_f32_hswish_wasmsimd_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08005073 .element_tile = 16,
5074 };
Marat Dukhanf4935a22020-07-16 15:59:10 -07005075 if (is_wasm_x86) {
Marat Dukhan2894e992021-12-30 08:29:48 -08005076 xnn_params.f32.lrelu = (struct vunary_parameters) {
5077 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_minmax_x8,
5078 .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
5079 .element_tile = 8,
5080 };
Marat Dukhanf4935a22020-07-16 15:59:10 -07005081 } else {
Marat Dukhan2894e992021-12-30 08:29:48 -08005082 xnn_params.f32.lrelu = (struct vunary_parameters) {
5083 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_bitselect_x8,
5084 .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
5085 .element_tile = 8,
5086 };
Marat Dukhanf4935a22020-07-16 15:59:10 -07005087 }
Marat Dukhane5efb162021-12-31 10:26:13 -08005088 xnn_params.f32.neg = (struct vunary_parameters) {
5089 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__wasmsimd_x8,
5090 .init.f32_neg = xnn_init_f32_neg_wasmsimd_params,
5091 .element_tile = 16,
5092 };
Marat Dukhan6674d692021-05-05 22:27:00 -07005093 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__wasmsimd_x16;
Marat Dukhan189c1d02021-09-03 15:39:54 -07005094 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 91)
Marat Dukhan0e801372022-01-04 00:10:41 -08005095 xnn_params.f32.rndne = (struct vunary_parameters) {
5096 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_native_x8,
5097 .element_tile = 8,
5098 };
5099 xnn_params.f32.rndz = (struct vunary_parameters) {
5100 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_native_x8,
5101 .element_tile = 8,
5102 };
5103 xnn_params.f32.rndu = (struct vunary_parameters) {
5104 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_native_x8,
5105 .element_tile = 8,
5106 };
5107 xnn_params.f32.rndd = (struct vunary_parameters) {
5108 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_native_x8,
5109 .element_tile = 8,
5110 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07005111 #else // XNN_WASMSIMD_VERSION >= 91
Marat Dukhan0e801372022-01-04 00:10:41 -08005112 xnn_params.f32.rndne = (struct vunary_parameters) {
5113 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8,
5114 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5115 .element_tile = 8,
5116 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07005117 if (is_wasm_x86) {
Marat Dukhan0e801372022-01-04 00:10:41 -08005118 xnn_params.f32.rndz = (struct vunary_parameters) {
5119 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8,
5120 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5121 .element_tile = 8,
5122 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07005123 } else {
Marat Dukhan0e801372022-01-04 00:10:41 -08005124 xnn_params.f32.rndz = (struct vunary_parameters) {
5125 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8,
5126 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5127 .element_tile = 8,
5128 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07005129 }
Marat Dukhan0e801372022-01-04 00:10:41 -08005130 xnn_params.f32.rndu = (struct vunary_parameters) {
5131 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8,
5132 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5133 .element_tile = 8,
5134 };
5135 xnn_params.f32.rndd = (struct vunary_parameters) {
5136 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8,
5137 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5138 .element_tile = 8,
5139 };
Marat Dukhan189c1d02021-09-03 15:39:54 -07005140 #endif // XNN_WASMSIMD_VERSION >= 91
Marat Dukhance834ad2022-01-03 00:22:01 -08005141 xnn_params.f32.sigmoid = (struct vunary_parameters) {
5142 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_x16,
5143 .init.f32_sigmoid = xnn_init_f32_sigmoid_wasmsimd_rr2_p5_params,
5144 .element_tile = 16,
5145 };
Marat Dukhane5efb162021-12-31 10:26:13 -08005146 xnn_params.f32.sqr = (struct vunary_parameters) {
5147 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__wasmsimd_x8,
5148 .element_tile = 16,
5149 };
Marat Dukhane72b2822021-12-30 14:46:58 -08005150 xnn_params.f32.sqrt = (struct vunary_parameters) {
5151 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8,
5152 .element_tile = 8,
5153 };
Marat Dukhan195f8eb2020-06-25 12:50:57 -07005154 if (is_wasm_x86) {
5155 xnn_params.f32.prelu = (struct prelu_parameters) {
Marat Dukhan78299282020-07-15 17:38:06 -07005156 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_minmax_2x8,
Marat Dukhan195f8eb2020-06-25 12:50:57 -07005157 .row_tile = 2,
5158 .channel_tile = 8,
5159 };
5160 } else {
5161 xnn_params.f32.prelu = (struct prelu_parameters) {
Marat Dukhan78299282020-07-15 17:38:06 -07005162 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_bitselect_2x8,
Marat Dukhan195f8eb2020-06-25 12:50:57 -07005163 .row_tile = 2,
5164 .channel_tile = 8,
5165 };
5166 }
Marat Dukhan4a5c7712022-01-05 22:43:13 -08005167 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
5168 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x16_acc2,
5169 .init = xnn_init_f32_expminus_wasmsimd_rr2_p5_params,
5170 .element_tile = 16,
5171 };
Marat Dukhancdc56552020-06-26 19:49:41 -07005172 if (is_wasm_x86) {
Marat Dukhan0bf8afa2021-09-20 10:02:18 -07005173 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_x86;
Marat Dukhancdc56552020-06-26 19:49:41 -07005174 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005175 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16,
5176 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
5177 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
5178 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
5179 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
5180 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005181 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07005182 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005183 };
5184 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardb392f8e2020-10-27 10:46:44 -07005185 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16,
5186 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16,
5187 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16,
5188 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
5189 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
5190 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005191 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchardb392f8e2020-10-27 10:46:44 -07005192 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005193 };
5194 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005195 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_x86_x16,
5196 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
5197 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
5198 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005199 };
5200 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005201 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_x86_x16,
5202 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
5203 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
Frank Barchard9c7308f2020-08-31 17:03:01 -07005204 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005205 };
5206 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005207 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16,
5208 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
5209 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
5210 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
5211 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
5212 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005213 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07005214 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005215 };
5216 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005217 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16,
5218 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16,
5219 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16,
5220 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
5221 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
5222 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005223 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07005224 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005225 };
5226 } else {
Marat Dukhan0bf8afa2021-09-20 10:02:18 -07005227 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_arm;
Marat Dukhancdc56552020-06-26 19:49:41 -07005228 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005229 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16,
5230 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
5231 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
5232 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
5233 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
5234 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005235 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07005236 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005237 };
5238 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardb392f8e2020-10-27 10:46:44 -07005239 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16,
5240 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16,
5241 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16,
5242 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
5243 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
5244 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005245 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchardb392f8e2020-10-27 10:46:44 -07005246 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005247 };
5248 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005249 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_arm_x16,
5250 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
5251 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
5252 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005253 };
5254 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005255 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_arm_x16,
5256 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
5257 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
5258 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005259 };
5260 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005261 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16,
5262 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
5263 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
5264 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
5265 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
5266 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005267 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07005268 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005269 };
5270 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005271 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16,
5272 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16,
5273 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16,
5274 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
5275 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
5276 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
Marat Dukhanf6004972021-12-30 11:23:02 -08005277 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
Frank Barchard9c7308f2020-08-31 17:03:01 -07005278 .element_tile = 16,
Marat Dukhancdc56552020-06-26 19:49:41 -07005279 };
5280 }
Marat Dukhanf7399262020-06-05 10:58:44 -07005281 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005282 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__wasmsimd_x16,
5283 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
5284 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
5285 .element_tile = 16,
Marat Dukhanf7399262020-06-05 10:58:44 -07005286 };
Marat Dukhand816f622020-07-15 10:14:39 -07005287 if (is_wasm_x86) {
5288 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07005289 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_x86_2x,
Marat Dukhand57186a2021-12-30 11:37:24 -08005290 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhand816f622020-07-15 10:14:39 -07005291 .channel_tile = 4,
5292 .row_tile = 2,
5293 };
5294 } else {
5295 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07005296 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_arm_2x,
Marat Dukhand57186a2021-12-30 11:37:24 -08005297 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
Marat Dukhand816f622020-07-15 10:14:39 -07005298 .channel_tile = 4,
5299 .row_tile = 2,
5300 };
5301 }
Erich Elsen6e80fdc2020-06-09 15:35:37 -07005302 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08005303 init_flags |= XNN_INIT_FLAG_CHW_OPT;
5304
Frank Barchard498cb502020-11-16 23:50:04 -08005305 if (is_wasm_x86) {
5306 xnn_params.f32.spmm = (struct spmm_parameters) {
5307 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86,
5308 .mr = 32,
5309 .nr = 1,
5310 };
5311 } else {
5312 xnn_params.f32.spmm = (struct spmm_parameters) {
5313 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm,
5314 .mr = 32,
5315 .nr = 1,
5316 };
5317 }
Erich Elsen0a1970e2020-06-10 09:24:59 -07005318 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
5319 .ukernel_with_symm_padding =
Frank Barchard22136062020-11-24 18:44:46 -08005320 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2,
Erich Elsen0a1970e2020-06-10 09:24:59 -07005321 .output_channel_tile = 4,
5322 .output_height_tile = 2,
5323 .output_width_tile = 2,
5324 };
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005325 if (is_wasm_x86) {
5326 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005327 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005328 .output_width_tile = 4,
Frank Barchard97883b82020-11-23 13:01:03 -08005329 .output_height_tile = 2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005330 };
5331 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005332 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005333 .output_width_tile = 4,
5334 .output_height_tile = 1,
5335 };
5336 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005337 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005338 .output_width_tile = 4,
5339 .output_height_tile = 3,
5340 };
5341 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005342 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005343 .output_width_tile = 4,
5344 .output_height_tile = 1,
5345 };
5346 } else {
5347 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005348 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005349 .output_width_tile = 4,
Frank Barchard97883b82020-11-23 13:01:03 -08005350 .output_height_tile = 2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005351 };
5352 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005353 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005354 .output_width_tile = 4,
5355 .output_height_tile = 1,
5356 };
5357 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005358 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005359 .output_width_tile = 4,
5360 .output_height_tile = 3,
5361 };
5362 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
Frank Barchard412e2f42020-12-11 11:40:50 -08005363 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2,
Frank Barcharddb5c32d2020-11-16 23:58:42 -08005364 .output_width_tile = 4,
5365 .output_height_tile = 1,
5366 };
5367 }
Marat Dukhanc5045bf2020-07-27 18:16:35 -07005368 if (is_wasm_x86) {
5369 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5370 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4,
5371 .channel_tile = 4,
5372 };
5373 } else {
5374 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5375 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4,
5376 .channel_tile = 4,
5377 };
5378 }
Artsiom Ablavatski97918102020-10-27 15:52:59 -07005379 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
5380 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8,
5381 .channel_tile = 1,
Artsiom Ablavatskib3ffd582021-03-31 13:00:08 -07005382 .pixel_tile = 8,
Artsiom Ablavatski97918102020-10-27 15:52:59 -07005383 };
Erich Elsen6e80fdc2020-06-09 15:35:37 -07005384 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005385 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005386
Frank Barchardb40ee632021-12-30 11:10:02 -08005387 /*************************** VCVT WAsm SIMD micro-kernels ***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07005388 #ifndef XNN_NO_VCVT_OPERATORS
5389 init_flags |= XNN_INIT_FLAG_VCVT;
5390
Marat Dukhan134f9842021-12-29 19:57:31 -08005391 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
5392 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16,
5393 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params,
5394 .element_tile = 16,
5395 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08005396 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
5397 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__wasmsimd_x24,
5398 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_wasmsimd_params,
5399 .element_tile = 24,
5400 };
Marat Dukhand52d20b2021-12-05 09:50:25 -08005401 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
5402 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_x32,
5403 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_wasmsimd_magic_params,
5404 .element_tile = 32,
5405 };
5406 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
5407 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x32,
5408 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_wasmsimd_magic_params,
5409 .element_tile = 32,
5410 };
Marat Dukhanf92206b2021-12-10 17:02:07 -08005411 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
5412 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__wasmsimd_x32,
5413 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_wasmsimd_params,
5414 .element_tile = 32,
5415 };
5416 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
5417 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__wasmsimd_x32,
5418 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_wasmsimd_params,
5419 .element_tile = 32,
5420 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07005421 #endif // XNN_NO_VCVT_OPERATORS
5422
Frank Barchardb40ee632021-12-30 11:10:02 -08005423 /**************************** X32 WAsm SIMD micro-kernels ****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005424 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005425 init_flags |= XNN_INIT_FLAG_X32;
5426
Marat Dukhan9d4bfa22020-07-16 19:07:04 -07005427 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__wasmsimd;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005428 xnn_params.x32.zip = (struct zip_parameters) {
Marat Dukhane3b78762020-07-16 20:02:58 -07005429 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__wasmsimd,
5430 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__wasmsimd,
5431 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__wasmsimd,
5432 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__wasmsimd,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005433 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08005434 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08005435 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
5436 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08005437 .channel_tile = 1,
5438 .pixel_tile = 1,
5439 };
5440 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005441 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005442
Frank Barchardb40ee632021-12-30 11:10:02 -08005443 /**************************** XX WAsm SIMD micro-kernels ****************************/
Marat Dukhan048931b2020-11-24 20:53:54 -08005444 #ifndef XNN_NO_XX_OPERATORS
5445 init_flags |= XNN_INIT_FLAG_XX;
5446
5447 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
Marat Dukhan933051b2021-08-07 16:26:15 -07005448 xnn_params.xx.fill = (struct fill_parameters) {
5449 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__wasmsimd_x64,
5450 .row_tile = 1,
5451 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07005452 xnn_params.xx.pad = (struct pad_parameters) {
5453 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__wasmsimd,
5454 .row_tile = 1,
5455 };
Marat Dukhan048931b2020-11-24 20:53:54 -08005456 #endif
5457
Marat Dukhan933051b2021-08-07 16:26:15 -07005458#elif XNN_ARCH_WASM
5459
Frank Barchardb40ee632021-12-30 11:10:02 -08005460 /**************************** QC8 WAsm micro-kernels ****************************/
Marat Dukhan898d5852021-06-30 21:18:34 -07005461 #ifndef XNN_NO_QC8_OPERATORS
5462 init_flags |= XNN_INIT_FLAG_QC8;
5463
5464 if (is_wasm_x86) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005465 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5466 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5467 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5468 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5469 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
Marat Dukhan898d5852021-06-30 21:18:34 -07005470 xnn_params.qc8.gemm.mr = 2;
5471 xnn_params.qc8.gemm.nr = 2;
5472 } else {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005473 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5474 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5475 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5476 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
Marat Dukhan2ac722e2022-01-04 01:54:20 -08005477 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
Marat Dukhan898d5852021-06-30 21:18:34 -07005478 xnn_params.qc8.gemm.mr = 4;
5479 xnn_params.qc8.gemm.nr = 4;
5480 }
5481
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005482 if (is_wasm_x86) {
5483 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5484 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
5485 xnn_params.qc8.dwconv[0].channel_tile = 2;
5486 xnn_params.qc8.dwconv[0].primary_tile = 9;
5487 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5488 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
5489 xnn_params.qc8.dwconv[1].channel_tile = 1;
5490 xnn_params.qc8.dwconv[1].primary_tile = 25;
5491 } else {
5492 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5493 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
5494 xnn_params.qc8.dwconv[0].channel_tile = 2;
5495 xnn_params.qc8.dwconv[0].primary_tile = 9;
5496 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5497 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
5498 xnn_params.qc8.dwconv[1].channel_tile = 2;
5499 xnn_params.qc8.dwconv[1].primary_tile = 25;
5500 }
Marat Dukhan898d5852021-06-30 21:18:34 -07005501 #endif // XNN_NO_QC8_OPERATORS
5502
Frank Barchardb40ee632021-12-30 11:10:02 -08005503 /**************************** QS8 WAsm micro-kernels ****************************/
Marat Dukhan803c1f82021-05-12 00:13:37 -07005504 #ifndef XNN_NO_QS8_OPERATORS
5505 init_flags |= XNN_INIT_FLAG_QS8;
5506
5507 if (is_wasm_x86) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005508 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5509 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5510 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5511 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5512 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
Marat Dukhan803c1f82021-05-12 00:13:37 -07005513 xnn_params.qs8.gemm.mr = 2;
5514 xnn_params.qs8.gemm.nr = 2;
5515 } else {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005516 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5517 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5518 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5519 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
Marat Dukhan2ac722e2022-01-04 01:54:20 -08005520 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan803c1f82021-05-12 00:13:37 -07005521 xnn_params.qs8.gemm.mr = 4;
5522 xnn_params.qs8.gemm.nr = 4;
5523 }
5524
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005525 if (is_wasm_x86) {
5526 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5527 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5528 xnn_params.qs8.dwconv[0].channel_tile = 2;
5529 xnn_params.qs8.dwconv[0].primary_tile = 9;
5530 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5531 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5532 xnn_params.qs8.dwconv[1].channel_tile = 1;
5533 xnn_params.qs8.dwconv[1].primary_tile = 25;
5534 } else {
5535 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5536 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5537 xnn_params.qs8.dwconv[0].channel_tile = 2;
5538 xnn_params.qs8.dwconv[0].primary_tile = 9;
5539 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5540 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5541 xnn_params.qs8.dwconv[1].channel_tile = 2;
5542 xnn_params.qs8.dwconv[1].primary_tile = 25;
5543 }
Marat Dukhan803c1f82021-05-12 00:13:37 -07005544
5545 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan847ff5e2022-01-11 20:31:06 -08005546 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
5547 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
Marat Dukhan53f41062022-01-11 19:44:57 -08005548 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
5549 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08005550 .row_tile = 7,
5551 .channel_tile = 4,
Marat Dukhan803c1f82021-05-12 00:13:37 -07005552 };
5553
5554 xnn_params.qs8.vadd = (struct vbinary_parameters) {
5555 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
5556 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
5557 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07005558 .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
Marat Dukhan803c1f82021-05-12 00:13:37 -07005559 .element_tile = 4,
5560 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07005561 xnn_params.qs8.vmul = (struct vbinary_parameters) {
5562 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
5563 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
5564 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
5565 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
5566 .element_tile = 4,
5567 };
Marat Dukhan803c1f82021-05-12 00:13:37 -07005568 #endif // XNN_NO_QS8_OPERATORS
5569
Frank Barchardb40ee632021-12-30 11:10:02 -08005570 /**************************** QU8 WAsm micro-kernels ****************************/
Marat Dukhan08b7a972020-07-14 18:17:29 -07005571 #ifndef XNN_NO_QU8_OPERATORS
5572 init_flags |= XNN_INIT_FLAG_QU8;
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005573
Marat Dukhan3d76e552021-07-15 18:54:01 -07005574 if (is_wasm_x86) {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005575 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5576 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5577 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5578 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5579 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
Marat Dukhan3d76e552021-07-15 18:54:01 -07005580 xnn_params.qu8.gemm.mr = 2;
5581 xnn_params.qu8.gemm.nr = 2;
5582 } else {
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005583 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5584 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5585 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5586 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
Marat Dukhan2ac722e2022-01-04 01:54:20 -08005587 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
Marat Dukhan3d76e552021-07-15 18:54:01 -07005588 xnn_params.qu8.gemm.mr = 4;
5589 xnn_params.qu8.gemm.nr = 4;
5590 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07005591
Marat Dukhan7c1115f2022-01-04 17:18:41 -08005592 if (is_wasm_x86) {
5593 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5594 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5595 xnn_params.qu8.dwconv[0].channel_tile = 2;
5596 xnn_params.qu8.dwconv[0].primary_tile = 9;
5597 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5598 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5599 xnn_params.qu8.dwconv[1].channel_tile = 1;
5600 xnn_params.qu8.dwconv[1].primary_tile = 25;
5601 } else {
5602 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5603 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5604 xnn_params.qu8.dwconv[0].channel_tile = 2;
5605 xnn_params.qu8.dwconv[0].primary_tile = 9;
5606 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5607 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5608 xnn_params.qu8.dwconv[1].channel_tile = 2;
5609 xnn_params.qu8.dwconv[1].primary_tile = 25;
5610 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07005611
Marat Dukhan08b7a972020-07-14 18:17:29 -07005612 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08005613 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
5614 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
5615 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
5616 .primary_tile = 9,
5617 .incremental_tile = 8,
5618 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005619 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07005620 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhand1f53e42022-01-12 22:34:51 -08005621 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
5622 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
5623 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
5624 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08005625 .row_tile = 7,
Marat Dukhand1f53e42022-01-12 22:34:51 -08005626 .channel_tile = 4,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005627 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07005628
5629 xnn_params.qu8.vadd = (struct vbinary_parameters) {
5630 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
5631 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
5632 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07005633 .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -07005634 .element_tile = 4,
5635 };
Marat Dukhan0853b8a2021-08-03 01:01:53 -07005636 xnn_params.qu8.vmul = (struct vbinary_parameters) {
5637 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
5638 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
5639 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
5640 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
5641 .element_tile = 4,
5642 };
Marat Dukhan08b7a972020-07-14 18:17:29 -07005643 #endif // XNN_NO_QU8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005644
Frank Barchardb40ee632021-12-30 11:10:02 -08005645 /**************************** S8 WAsm micro-kernels****************************/
Marat Dukhandc5c1482021-08-16 09:03:15 -07005646 #ifndef XNN_NO_S8_OPERATORS
5647 init_flags |= XNN_INIT_FLAG_S8;
5648
Marat Dukhan61c0c9e2021-08-16 23:16:14 -07005649 xnn_params.s8.clamp = (struct vunary_parameters) {
5650 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
5651 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
5652 .element_tile = 4,
5653 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08005654 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
5655 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
5656 .pixel_tile = 1,
5657 .channel_tile = 1,
5658 };
Marat Dukhandc5c1482021-08-16 09:03:15 -07005659 xnn_params.s8.maxpool = (struct maxpool_parameters) {
5660 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
5661 .init.s8 = xnn_init_s8_minmax_scalar_params,
5662 .mr = 9,
5663 .qr = 8,
5664 };
5665 #endif // XNN_NO_S8_OPERATORS
5666
Frank Barchardb40ee632021-12-30 11:10:02 -08005667 /**************************** U8 WAsm micro-kernels****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005668 #ifndef XNN_NO_U8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005669 init_flags |= XNN_INIT_FLAG_U8;
5670
Marat Dukhan94912792021-08-16 21:40:30 -07005671 xnn_params.u8.clamp = (struct vunary_parameters) {
5672 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
5673 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
5674 .element_tile = 4,
5675 };
Marat Dukhan24abe6b2021-11-24 15:28:57 -08005676 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
5677 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
5678 .pixel_tile = 1,
5679 .channel_tile = 1,
5680 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005681 xnn_params.u8.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005682 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07005683 .init.u8 = xnn_init_u8_minmax_scalar_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005684 .mr = 9,
5685 .qr = 8,
5686 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005687 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
5688 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
5689 #endif // XNN_NO_U8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005690
Frank Barchardb40ee632021-12-30 11:10:02 -08005691 /**************************** X8 WAsm micro-kernels****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005692 #ifndef XNN_NO_X8_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005693 init_flags |= XNN_INIT_FLAG_X8;
5694
Marat Dukhand67539d2021-09-08 23:06:03 -07005695 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005696 xnn_params.x8.zip = (struct zip_parameters) {
5697 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
5698 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
5699 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
5700 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
5701 };
5702 #endif // XNN_NO_X8_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07005703
Frank Barchardb40ee632021-12-30 11:10:02 -08005704 /**************************** F32 WAsm micro-kernels****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005705 #ifndef XNN_NO_F32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07005706 init_flags |= XNN_INIT_FLAG_F32;
5707
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005708 if (is_wasm_x86) {
Marat Dukhanaefaef32020-04-09 07:09:34 -07005709 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_2x4__scalar);
5710 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_2x4__scalar);
5711 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
5712 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
Marat Dukhan467f6362020-05-22 23:21:55 -07005713 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_2x4__scalar);
5714 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_2x4__scalar);
5715 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
5716 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
Marat Dukhan869c62d2020-04-09 17:17:55 -07005717 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_2x4__scalar);
5718 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_2x4__scalar);
5719 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
5720 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005721 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005722 xnn_params.f32.gemm.mr = 2;
5723 xnn_params.f32.gemm.nr = 4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005724 } else {
Marat Dukhanaefaef32020-04-09 07:09:34 -07005725 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__wasm);
5726 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__wasm);
5727 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
5728 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
Marat Dukhan467f6362020-05-22 23:21:55 -07005729 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__wasm);
5730 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__wasm);
5731 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
5732 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
Marat Dukhan869c62d2020-04-09 17:17:55 -07005733 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__wasm);
5734 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__wasm);
5735 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
5736 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005737 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005738 xnn_params.f32.gemm.mr = 4;
5739 xnn_params.f32.gemm.nr = 4;
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005740 }
Marat Dukhanaefaef32020-04-09 07:09:34 -07005741 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__wasm);
5742 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__wasm),
Marat Dukhan869c62d2020-04-09 17:17:55 -07005743 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__wasm);
5744 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__wasm),
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005745 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005746 xnn_params.f32.gemm2.mr = 4;
5747 xnn_params.f32.gemm2.nr = 2;
5748
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005749 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__wasm_acc2;
5750 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__wasm_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005751 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005752 xnn_params.f32.dwconv[0].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005753 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005754
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005755 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__wasm_acc2;
5756 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__wasm_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005757 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005758 xnn_params.f32.dwconv[1].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005759 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005760
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005761 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__wasm_acc2;
5762 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__wasm_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07005763 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005764 xnn_params.f32.dwconv[2].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07005765 xnn_params.f32.dwconv[2].primary_tile = 9;
5766
5767 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__wasm_acc2;
5768 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__wasm_acc2;
5769 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
5770 xnn_params.f32.dwconv[3].channel_tile = 1;
5771 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhanaefaef32020-04-09 07:09:34 -07005772
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005773 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08005774 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasm_c1,
5775 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasm_c1,
5776 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5777 .primary_tile = 9,
5778 .incremental_tile = 8,
5779 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005780 };
5781 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08005782 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasm_c1,
5783 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasm_c1,
5784 .primary_tile = 9,
5785 .incremental_tile = 8,
5786 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005787 };
5788 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08005789 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1,
5790 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1,
5791 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5792 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
5793 .row_tile = 7,
5794 .channel_tile = 1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005795 };
5796 xnn_params.f32.maxpool = (struct maxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005797 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasm_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07005798 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005799 .mr = 9,
5800 .qr = 8,
5801 };
5802 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005803 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005804 .mr = 4,
5805 };
5806 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005807 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005808 .mr = 9,
5809 };
5810 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
Marat Dukhan99936602020-04-11 16:47:01 -07005811 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005812 .mr = 9,
5813 .qr = 8,
5814 };
Marat Dukhan660fd192020-03-10 04:55:30 -07005815 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
5816 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
Marat Dukhan69722492019-11-11 19:55:50 -08005817 .pixel_tile = 1,
5818 .channel_tile = 2,
5819 };
Marat Dukhane5efb162021-12-31 10:26:13 -08005820 xnn_params.f32.abs = (struct vunary_parameters) {
5821 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
5822 .element_tile = 4,
5823 };
Marat Dukhan94912792021-08-16 21:40:30 -07005824 xnn_params.f32.clamp = (struct vunary_parameters) {
5825 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasm_x4,
5826 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
5827 .element_tile = 4,
5828 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005829 if (is_wasm_x86) {
Marat Dukhan561d0682021-12-23 16:12:35 -08005830 xnn_params.f32.hswish = (struct vunary_parameters) {
5831 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08005832 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08005833 .element_tile = 4,
5834 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005835 } else {
Marat Dukhan561d0682021-12-23 16:12:35 -08005836 xnn_params.f32.hswish = (struct vunary_parameters) {
5837 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__wasm_x4,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08005838 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08005839 .element_tile = 4,
5840 };
Marat Dukhanc303fe62020-06-26 10:09:25 -07005841 }
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005842 if (is_wasm_x86) {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08005843 xnn_params.f32.elu = (struct vunary_parameters) {
5844 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2,
5845 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
5846 .element_tile = 2,
5847 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005848 } else {
Marat Dukhan4a79ff22022-01-01 12:16:48 -08005849 xnn_params.f32.elu = (struct vunary_parameters) {
5850 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasm_rr2_p6_x6,
5851 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_p6_params,
5852 .element_tile = 6,
5853 };
Marat Dukhanb6bd4bc2020-12-01 17:01:40 -08005854 }
Marat Dukhan2894e992021-12-30 08:29:48 -08005855 xnn_params.f32.lrelu = (struct vunary_parameters) {
5856 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
5857 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
5858 .element_tile = 4,
5859 };
Marat Dukhane5efb162021-12-31 10:26:13 -08005860 xnn_params.f32.neg = (struct vunary_parameters) {
5861 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
5862 .element_tile = 4,
5863 };
Frank Barchard62c5e232020-07-21 17:42:19 -07005864 if (is_wasm_x86) {
Marat Dukhan6674d692021-05-05 22:27:00 -07005865 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__scalar_x8;
Frank Barchard62c5e232020-07-21 17:42:19 -07005866 } else {
Marat Dukhan6674d692021-05-05 22:27:00 -07005867 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__wasm_x8;
Frank Barchard62c5e232020-07-21 17:42:19 -07005868 }
Marat Dukhan0e801372022-01-04 00:10:41 -08005869 xnn_params.f32.rndne = (struct vunary_parameters) {
5870 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4,
5871 .element_tile = 4,
5872 };
5873 xnn_params.f32.rndz = (struct vunary_parameters) {
5874 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4,
5875 .element_tile = 4,
5876 };
5877 xnn_params.f32.rndu = (struct vunary_parameters) {
5878 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4,
5879 .element_tile = 4,
5880 };
5881 xnn_params.f32.rndd = (struct vunary_parameters) {
5882 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4,
5883 .element_tile = 4,
5884 };
Marat Dukhance834ad2022-01-03 00:22:01 -08005885 xnn_params.f32.sigmoid = (struct vunary_parameters) {
5886 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
5887 .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
5888 .element_tile = 2,
5889 };
Marat Dukhane5efb162021-12-31 10:26:13 -08005890 xnn_params.f32.sqr = (struct vunary_parameters) {
5891 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
5892 .element_tile = 4,
5893 };
Marat Dukhane72b2822021-12-30 14:46:58 -08005894 xnn_params.f32.sqrt = (struct vunary_parameters) {
5895 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
5896 .element_tile = 1,
5897 };
Marat Dukhan7c1f8082020-06-25 13:26:20 -07005898 if (is_wasm_x86) {
5899 xnn_params.f32.prelu = (struct prelu_parameters) {
5900 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
5901 .row_tile = 2,
5902 .channel_tile = 4,
5903 };
5904 } else {
5905 xnn_params.f32.prelu = (struct prelu_parameters) {
5906 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasm_2x4,
5907 .row_tile = 2,
5908 .channel_tile = 4,
5909 };
5910 }
Marat Dukhan4a5c7712022-01-05 22:43:13 -08005911 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
5912 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
5913 .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
5914 .element_tile = 4,
5915 };
Marat Dukhan1edc4542020-01-27 12:40:13 -08005916 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08005917 xnn_params.f32.vadd = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005918 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasm_x8,
5919 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
5920 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08005921 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhanb1a0fc32019-12-02 19:32:02 -08005922 .element_tile = 8,
5923 };
Marat Dukhan69180502019-12-06 15:00:31 -08005924 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Frank Barchardb392f8e2020-10-27 10:46:44 -07005925 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasm_x8,
5926 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasm_x8,
5927 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasm_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08005928 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Frank Barchardb392f8e2020-10-27 10:46:44 -07005929 .element_tile = 8,
Marat Dukhan69180502019-12-06 15:00:31 -08005930 };
Marat Dukhan79e7f842019-12-05 14:35:50 -08005931 xnn_params.f32.vmax = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005932 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasm_x8,
5933 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
5934 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08005935 .element_tile = 8,
5936 };
5937 xnn_params.f32.vmin = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005938 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasm_x8,
5939 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
5940 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
Marat Dukhan79e7f842019-12-05 14:35:50 -08005941 .element_tile = 8,
5942 };
Marat Dukhan1e782c42019-11-21 17:02:40 -08005943 xnn_params.f32.vmul = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005944 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasm_x8,
5945 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
5946 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08005947 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhanca2733c2019-11-15 23:21:17 -08005948 .element_tile = 8,
5949 };
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08005950 xnn_params.f32.vsub = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005951 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasm_x8,
5952 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasm_x8,
5953 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasm_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08005954 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhan05f3f6d2019-12-03 15:13:53 -08005955 .element_tile = 8,
5956 };
Marat Dukhanf7399262020-06-05 10:58:44 -07005957 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
Frank Barchard9c7308f2020-08-31 17:03:01 -07005958 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
5959 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
5960 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
Marat Dukhanf7399262020-06-05 10:58:44 -07005961 .element_tile = 8,
5962 };
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005963 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
Marat Dukhan9531e9f2020-07-24 15:25:02 -07005964 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__wasm_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07005965 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhan49e6ee92019-11-06 15:55:29 -08005966 .channel_tile = 1,
5967 .row_tile = 2,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005968 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08005969 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhancfbed0a2020-12-08 10:01:51 -08005970 init_flags |= XNN_INIT_FLAG_CHW_OPT;
5971
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005972 xnn_params.f32.spmm = (struct spmm_parameters) {
Marat Dukhan355ab432020-04-09 19:01:52 -07005973 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
Marat Dukhanbff791e2019-10-24 11:05:37 -07005974 .mr = 8,
Marat Dukhan8fe54e42019-10-10 14:12:59 -07005975 .nr = 1,
5976 };
Erich Elsenc6afd9b2019-10-24 16:10:53 -07005977 xnn_params.f32.spmm2 = (struct spmm_parameters) {
Marat Dukhan355ab432020-04-09 19:01:52 -07005978 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
Erich Elsenc6afd9b2019-10-24 16:10:53 -07005979 .mr = 8,
5980 .nr = 2,
5981 };
5982 xnn_params.f32.spmm4 = (struct spmm_parameters) {
Marat Dukhan355ab432020-04-09 19:01:52 -07005983 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
Erich Elsenc6afd9b2019-10-24 16:10:53 -07005984 .mr = 8,
5985 .nr = 4,
5986 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07005987 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005988 .ukernel_with_symm_padding =
Marat Dukhan1f29b802020-05-15 23:46:39 -07005989 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005990 .output_channel_tile = 4,
5991 .output_height_tile = 1,
5992 .output_width_tile = 1,
5993 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07005994 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
Marat Dukhan91249d22020-10-24 12:02:51 -07005995 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005996 .output_width_tile = 1,
Marat Dukhan91249d22020-10-24 12:02:51 -07005997 .output_height_tile = 2,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07005998 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07005999 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
Marat Dukhancf5b3c32020-10-25 19:21:10 -07006000 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07006001 .output_width_tile = 1,
6002 .output_height_tile = 1,
6003 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07006004 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
6005 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
Marat Dukhana99918a2019-11-15 14:40:12 -08006006 .output_width_tile = 1,
6007 .output_height_tile = 1,
6008 };
Marat Dukhanbf715f92020-10-23 20:17:00 -07006009 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
6010 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
Marat Dukhana99918a2019-11-15 14:40:12 -08006011 .output_width_tile = 1,
6012 .output_height_tile = 1,
6013 };
Marat Dukhan1f29b802020-05-15 23:46:39 -07006014 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
6015 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
Marat Dukhan14fe0b22019-10-23 21:20:07 -07006016 .channel_tile = 1,
6017 };
Artsiom Ablavatski97918102020-10-27 15:52:59 -07006018 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
6019 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
6020 .channel_tile = 1,
6021 .pixel_tile = 4,
6022 };
Marat Dukhanefc47b82019-11-18 09:25:38 -08006023 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07006024 #endif // XNN_NO_F32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07006025
Frank Barchardb40ee632021-12-30 11:10:02 -08006026 /*************************** VCVT WAsm micro-kernels***************************/
Marat Dukhanaf2ba002021-10-24 14:21:41 -07006027 #ifndef XNN_NO_VCVT_OPERATORS
6028 init_flags |= XNN_INIT_FLAG_VCVT;
6029
Marat Dukhan134f9842021-12-29 19:57:31 -08006030 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
6031 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x1,
6032 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
6033 .element_tile = 1,
6034 };
Marat Dukhanb7c1b712021-12-30 07:23:57 -08006035 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
6036 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_bitcast_x4,
6037 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_bitcast_params,
6038 .element_tile = 4,
6039 };
Marat Dukhan430b1732021-12-04 02:53:12 -08006040 if (is_wasm_x86) {
6041 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08006042 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
6043 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
Marat Dukhan430b1732021-12-04 02:53:12 -08006044 .element_tile = 1,
6045 };
6046 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08006047 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
6048 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
Marat Dukhan430b1732021-12-04 02:53:12 -08006049 .element_tile = 1,
6050 };
6051 } else {
6052 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08006053 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
6054 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_fmagic_params,
Marat Dukhan430b1732021-12-04 02:53:12 -08006055 .element_tile = 4,
6056 };
6057 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
Marat Dukhanbdf10992022-01-04 09:20:14 -08006058 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
6059 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_fmagic_params,
Marat Dukhan430b1732021-12-04 02:53:12 -08006060 .element_tile = 4,
6061 };
6062 }
Marat Dukhanf92206b2021-12-10 17:02:07 -08006063 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
6064 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x1,
6065 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
6066 .element_tile = 1,
6067 };
6068 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
6069 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x1,
6070 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
6071 .element_tile = 1,
6072 };
Marat Dukhanaf2ba002021-10-24 14:21:41 -07006073 #endif // XNN_NO_VCVT_OPERATORS
6074
Frank Barchardb40ee632021-12-30 11:10:02 -08006075 /**************************** X32 WAsm micro-kernels****************************/
Marat Dukhan8fe54e42019-10-10 14:12:59 -07006076 #ifndef XNN_NO_X32_OPERATORS
Marat Dukhan854fb6b2020-06-19 12:33:44 -07006077 init_flags |= XNN_INIT_FLAG_X32;
6078
Marat Dukhan8fe54e42019-10-10 14:12:59 -07006079 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
6080 xnn_params.x32.zip = (struct zip_parameters) {
6081 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
6082 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
6083 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
6084 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
6085 };
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08006086 #ifndef XNN_NO_NCHW_OPERATORS
Marat Dukhanad71b9a2020-11-20 00:01:51 -08006087 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
6088 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
Artsiom Ablavatskibbe85062020-11-05 14:07:37 -08006089 .channel_tile = 1,
6090 .pixel_tile = 1,
6091 };
6092 #endif // XNN_NO_NCHW_OPERATORS
Marat Dukhan8fe54e42019-10-10 14:12:59 -07006093 #endif // XNN_NO_X32_OPERATORS
XNNPACK Teamb455b122019-09-27 18:10:33 -07006094
Frank Barchardb40ee632021-12-30 11:10:02 -08006095 /**************************** XX WAsm micro-kernels****************************/
Marat Dukhan933051b2021-08-07 16:26:15 -07006096 #ifndef XNN_NO_XX_OPERATORS
6097 init_flags |= XNN_INIT_FLAG_XX;
6098
6099 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
6100 xnn_params.xx.fill = (struct fill_parameters) {
6101 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
6102 .row_tile = 1,
6103 };
Marat Dukhan0461f2d2021-08-08 12:36:29 -07006104 xnn_params.xx.pad = (struct pad_parameters) {
6105 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
6106 .row_tile = 1,
6107 };
Marat Dukhan933051b2021-08-07 16:26:15 -07006108 #endif
6109
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006110#elif XNN_ARCH_RISCV
6111
Marat Dukhana198f002022-01-04 18:45:11 -08006112 /************************** QC8 RISC-V micro-kernels **************************/
6113 #ifndef XNN_NO_QC8_OPERATORS
6114 init_flags |= XNN_INIT_FLAG_QC8;
6115
6116 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6117 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6118 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6119 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6120 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
6121 xnn_params.qc8.gemm.mr = 3;
6122 xnn_params.qc8.gemm.nr = 4;
6123
6124 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
6125 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
6126 xnn_params.qc8.dwconv[0].channel_tile = 2;
6127 xnn_params.qc8.dwconv[0].primary_tile = 9;
6128 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
6129 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
6130 xnn_params.qc8.dwconv[1].channel_tile = 2;
6131 xnn_params.qc8.dwconv[1].primary_tile = 25;
6132 #endif // XNN_NO_QC8_OPERATORS
6133
6134 /************************** QS8 RISC-V micro-kernels **************************/
Marat Dukhan803c1f82021-05-12 00:13:37 -07006135 #ifndef XNN_NO_QS8_OPERATORS
6136 init_flags |= XNN_INIT_FLAG_QS8;
6137
Marat Dukhana198f002022-01-04 18:45:11 -08006138 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6139 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6140 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6141 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6142 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
Marat Dukhan15a35c02021-05-12 11:40:03 -07006143 xnn_params.qs8.gemm.mr = 3;
Marat Dukhan803c1f82021-05-12 00:13:37 -07006144 xnn_params.qs8.gemm.nr = 4;
6145
Marat Dukhana198f002022-01-04 18:45:11 -08006146 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
6147 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
Marat Dukhan803c1f82021-05-12 00:13:37 -07006148 xnn_params.qs8.dwconv[0].channel_tile = 2;
6149 xnn_params.qs8.dwconv[0].primary_tile = 9;
Marat Dukhana198f002022-01-04 18:45:11 -08006150 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
6151 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
Marat Dukhan4ed14882021-05-12 17:50:40 -07006152 xnn_params.qs8.dwconv[1].channel_tile = 2;
6153 xnn_params.qs8.dwconv[1].primary_tile = 25;
Marat Dukhan803c1f82021-05-12 00:13:37 -07006154
6155 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhan847ff5e2022-01-11 20:31:06 -08006156 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
6157 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
Marat Dukhan53f41062022-01-11 19:44:57 -08006158 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
6159 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08006160 .row_tile = 7,
6161 .channel_tile = 1,
Marat Dukhan803c1f82021-05-12 00:13:37 -07006162 };
6163
6164 xnn_params.qs8.vadd = (struct vbinary_parameters) {
6165 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
6166 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
6167 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07006168 .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
Marat Dukhan803c1f82021-05-12 00:13:37 -07006169 .element_tile = 4,
6170 };
Marat Dukhana198f002022-01-04 18:45:11 -08006171 xnn_params.qs8.vmul = (struct vbinary_parameters) {
6172 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
6173 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
6174 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
6175 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
6176 .element_tile = 4,
6177 };
Marat Dukhan803c1f82021-05-12 00:13:37 -07006178 #endif // XNN_NO_QS8_OPERATORS
6179
Marat Dukhana198f002022-01-04 18:45:11 -08006180 /************************** QU8 RISC-V micro-kernels **************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006181 #ifndef XNN_NO_QU8_OPERATORS
6182 init_flags |= XNN_INIT_FLAG_QU8;
6183
Marat Dukhana198f002022-01-04 18:45:11 -08006184 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6185 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6186 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6187 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6188 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6189 xnn_params.qu8.gemm.mr = 3;
6190 xnn_params.qu8.gemm.nr = 4;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006191
Marat Dukhana198f002022-01-04 18:45:11 -08006192 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
6193 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6194 xnn_params.qu8.dwconv[0].channel_tile = 2;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006195 xnn_params.qu8.dwconv[0].primary_tile = 9;
Marat Dukhana198f002022-01-04 18:45:11 -08006196 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
6197 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6198 xnn_params.qu8.dwconv[1].channel_tile = 2;
6199 xnn_params.qu8.dwconv[1].primary_tile = 25;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006200
6201 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08006202 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
6203 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
6204 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
6205 .primary_tile = 9,
6206 .incremental_tile = 8,
6207 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006208 };
6209 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
Marat Dukhand1f53e42022-01-12 22:34:51 -08006210 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
6211 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
6212 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
6213 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
Marat Dukhanda382d12022-01-07 19:51:20 -08006214 .row_tile = 7,
6215 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006216 };
Marat Dukhandb007cd2021-07-20 23:42:39 -07006217
6218 xnn_params.qu8.vadd = (struct vbinary_parameters) {
6219 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
6220 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
6221 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
Marat Dukhan64287252021-09-07 16:20:03 -07006222 .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
Marat Dukhandb007cd2021-07-20 23:42:39 -07006223 .element_tile = 4,
6224 };
Marat Dukhana198f002022-01-04 18:45:11 -08006225 xnn_params.qu8.vmul = (struct vbinary_parameters) {
6226 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
6227 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
6228 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
6229 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
6230 .element_tile = 4,
6231 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006232 #endif // XNN_NO_QU8_OPERATORS
6233
Marat Dukhana198f002022-01-04 18:45:11 -08006234 /************************** S8 RISC-V micro-kernels ***************************/
6235 #ifndef XNN_NO_S8_OPERATORS
6236 init_flags |= XNN_INIT_FLAG_S8;
6237
6238 xnn_params.s8.clamp = (struct vunary_parameters) {
6239 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
6240 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
6241 .element_tile = 4,
6242 };
6243 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
6244 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
6245 .pixel_tile = 1,
6246 .channel_tile = 1,
6247 };
6248 xnn_params.s8.maxpool = (struct maxpool_parameters) {
6249 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
6250 .init.s8 = xnn_init_s8_minmax_scalar_params,
6251 .mr = 9,
6252 .qr = 8,
6253 };
6254 #endif // XNN_NO_S8_OPERATORS
6255
6256 /************************** U8 RISC-V micro-kernels ***************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006257 #ifndef XNN_NO_U8_OPERATORS
6258 init_flags |= XNN_INIT_FLAG_U8;
6259
Marat Dukhan94912792021-08-16 21:40:30 -07006260 xnn_params.u8.clamp = (struct vunary_parameters) {
6261 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
6262 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
6263 .element_tile = 4,
6264 };
Marat Dukhana198f002022-01-04 18:45:11 -08006265 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
6266 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
6267 .pixel_tile = 1,
6268 .channel_tile = 1,
6269 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006270 xnn_params.u8.maxpool = (struct maxpool_parameters) {
6271 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07006272 .init.u8 = xnn_init_u8_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006273 .mr = 9,
6274 .qr = 8,
6275 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006276 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
6277 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
6278 #endif // XNN_NO_U8_OPERATORS
6279
Marat Dukhana198f002022-01-04 18:45:11 -08006280 /************************** X8 RISC-V micro-kernels ***************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006281 #ifndef XNN_NO_X8_OPERATORS
6282 init_flags |= XNN_INIT_FLAG_X8;
6283
Marat Dukhand67539d2021-09-08 23:06:03 -07006284 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006285 xnn_params.x8.zip = (struct zip_parameters) {
6286 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
6287 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
6288 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
6289 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
6290 };
6291 #endif // XNN_NO_X8_OPERATORS
6292
Marat Dukhana198f002022-01-04 18:45:11 -08006293 /************************** F32 RISC-V micro-kernels **************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006294 #ifndef XNN_NO_F32_OPERATORS
6295 init_flags |= XNN_INIT_FLAG_F32;
6296
6297 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
6298 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
6299 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
6300 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
6301 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
6302 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
6303 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
6304 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
6305 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
6306 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
6307 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
6308 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
Marat Dukhan104ae5e2021-05-24 13:41:57 -07006309 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006310 xnn_params.f32.gemm.mr = 4;
6311 xnn_params.f32.gemm.nr = 4;
6312
6313 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
6314 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar),
6315 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
6316 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar),
Marat Dukhan104ae5e2021-05-24 13:41:57 -07006317 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006318 xnn_params.f32.gemm2.mr = 4;
6319 xnn_params.f32.gemm2.nr = 2;
6320
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07006321 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
6322 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07006323 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006324 xnn_params.f32.dwconv[0].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07006325 xnn_params.f32.dwconv[0].primary_tile = 3;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006326
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07006327 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
6328 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07006329 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006330 xnn_params.f32.dwconv[1].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07006331 xnn_params.f32.dwconv[1].primary_tile = 4;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006332
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07006333 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
6334 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
Marat Dukhan104ae5e2021-05-24 13:41:57 -07006335 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006336 xnn_params.f32.dwconv[2].channel_tile = 1;
Artsiom Ablavatski47a74db2021-11-02 13:40:24 -07006337 xnn_params.f32.dwconv[2].primary_tile = 9;
6338
6339 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
6340 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
6341 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
6342 xnn_params.f32.dwconv[3].channel_tile = 1;
6343 xnn_params.f32.dwconv[3].primary_tile = 25;
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006344
6345 xnn_params.f32.avgpool = (struct avgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08006346 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
6347 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
6348 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
6349 .primary_tile = 9,
6350 .incremental_tile = 8,
6351 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006352 };
6353 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08006354 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
6355 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
6356 .primary_tile = 9,
6357 .incremental_tile = 8,
6358 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006359 };
6360 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
Marat Dukhanda382d12022-01-07 19:51:20 -08006361 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
6362 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
6363 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
6364 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
6365 .row_tile = 7,
6366 .channel_tile = 1,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006367 };
6368 xnn_params.f32.maxpool = (struct maxpool_parameters) {
6369 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
Marat Dukhan91ae1652021-08-15 19:19:49 -07006370 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006371 .mr = 9,
6372 .qr = 8,
6373 };
6374 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
6375 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
6376 .mr = 4,
6377 };
6378 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
6379 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
6380 .mr = 9,
6381 };
6382 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
6383 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
6384 .mr = 9,
6385 .qr = 8,
6386 };
6387 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
6388 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
6389 .pixel_tile = 1,
6390 .channel_tile = 2,
6391 };
Marat Dukhane5efb162021-12-31 10:26:13 -08006392 xnn_params.f32.abs = (struct vunary_parameters) {
6393 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
6394 .element_tile = 4,
6395 };
Marat Dukhana198f002022-01-04 18:45:11 -08006396 xnn_params.f32.clamp = (struct vunary_parameters) {
6397 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
6398 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
6399 .element_tile = 4,
6400 };
6401 xnn_params.f32.elu = (struct vunary_parameters) {
6402 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
6403 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
6404 .element_tile = 4,
6405 };
Marat Dukhan561d0682021-12-23 16:12:35 -08006406 xnn_params.f32.hswish = (struct vunary_parameters) {
6407 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
Marat Dukhan0d10cc72021-12-23 19:49:19 -08006408 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
Marat Dukhan561d0682021-12-23 16:12:35 -08006409 .element_tile = 4,
6410 };
Marat Dukhana198f002022-01-04 18:45:11 -08006411 xnn_params.f32.lrelu = (struct vunary_parameters) {
6412 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
6413 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
6414 .element_tile = 4,
Marat Dukhan4a79ff22022-01-01 12:16:48 -08006415 };
Marat Dukhane5efb162021-12-31 10:26:13 -08006416 xnn_params.f32.neg = (struct vunary_parameters) {
6417 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
6418 .element_tile = 4,
6419 };
Marat Dukhan0e801372022-01-04 00:10:41 -08006420 xnn_params.f32.rndne = (struct vunary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006421 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
6422 .element_tile = 1,
Marat Dukhan0e801372022-01-04 00:10:41 -08006423 };
6424 xnn_params.f32.rndz = (struct vunary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006425 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
6426 .element_tile = 1,
Marat Dukhan0e801372022-01-04 00:10:41 -08006427 };
6428 xnn_params.f32.rndu = (struct vunary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006429 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
6430 .element_tile = 1,
Marat Dukhan0e801372022-01-04 00:10:41 -08006431 };
6432 xnn_params.f32.rndd = (struct vunary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006433 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
6434 .element_tile = 1,
Marat Dukhan0e801372022-01-04 00:10:41 -08006435 };
Marat Dukhance834ad2022-01-03 00:22:01 -08006436 xnn_params.f32.sigmoid = (struct vunary_parameters) {
6437 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
6438 .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
6439 .element_tile = 2,
6440 };
Marat Dukhane5efb162021-12-31 10:26:13 -08006441 xnn_params.f32.sqr = (struct vunary_parameters) {
6442 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
6443 .element_tile = 4,
6444 };
Marat Dukhane72b2822021-12-30 14:46:58 -08006445 xnn_params.f32.sqrt = (struct vunary_parameters) {
6446 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
6447 .element_tile = 1,
6448 };
// Register the scalar f32 PReLU micro-kernel together with its tile sizes.
// NOTE(review): the kernel name ends in "2x4" while row_tile is set to 4 —
// confirm whether row_tile is intended to be 2 to match the kernel's tiling.
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006449 xnn_params.f32.prelu = (struct prelu_parameters) {
6450 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
Marat Dukhana198f002022-01-04 18:45:11 -08006451 .row_tile = 4,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006452 .channel_tile = 4,
6453 };
Marat Dukhan4a5c7712022-01-05 22:43:13 -08006454 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
6455 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
6456 .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
6457 .element_tile = 4,
6458 };
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006459 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
6460 xnn_params.f32.vadd = (struct vbinary_parameters) {
6461 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
6462 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
6463 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08006464 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006465 .element_tile = 8,
6466 };
6467 xnn_params.f32.vdiv = (struct vbinary_parameters) {
Marat Dukhana198f002022-01-04 18:45:11 -08006468 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
6469 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
6470 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
Marat Dukhanf6004972021-12-30 11:23:02 -08006471 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhana198f002022-01-04 18:45:11 -08006472 .element_tile = 2,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006473 };
6474 xnn_params.f32.vmax = (struct vbinary_parameters) {
6475 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
6476 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
6477 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
6478 .element_tile = 8,
6479 };
6480 xnn_params.f32.vmin = (struct vbinary_parameters) {
6481 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
6482 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
6483 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
6484 .element_tile = 8,
6485 };
6486 xnn_params.f32.vmul = (struct vbinary_parameters) {
6487 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
6488 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
6489 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08006490 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006491 .element_tile = 8,
6492 };
6493 xnn_params.f32.vsub = (struct vbinary_parameters) {
6494 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
6495 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
6496 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
Marat Dukhanf6004972021-12-30 11:23:02 -08006497 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006498 .element_tile = 8,
6499 };
6500 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
6501 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
6502 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
6503 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
6504 .element_tile = 8,
6505 };
6506 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
6507 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
Marat Dukhanc58453f2021-05-24 14:47:38 -07006508 .init.f32 = xnn_init_f32_minmax_scalar_params,
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006509 .channel_tile = 1,
6510 .row_tile = 2,
6511 };
6512 #ifndef XNN_NO_NCHW_OPERATORS
6513 init_flags |= XNN_INIT_FLAG_CHW_OPT;
6514
6515 xnn_params.f32.spmm = (struct spmm_parameters) {
6516 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
6517 .mr = 8,
6518 .nr = 1,
6519 };
6520 xnn_params.f32.spmm2 = (struct spmm_parameters) {
6521 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
6522 .mr = 8,
6523 .nr = 2,
6524 };
6525 xnn_params.f32.spmm4 = (struct spmm_parameters) {
6526 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
6527 .mr = 8,
6528 .nr = 4,
6529 };
6530 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
6531 .ukernel_with_symm_padding =
6532 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
6533 .output_channel_tile = 4,
6534 .output_height_tile = 1,
6535 .output_width_tile = 1,
6536 };
6537 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
6538 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
6539 .output_width_tile = 1,
6540 .output_height_tile = 2,
6541 };
6542 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
6543 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
6544 .output_width_tile = 1,
6545 .output_height_tile = 1,
6546 };
6547 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
6548 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
6549 .output_width_tile = 1,
6550 .output_height_tile = 1,
6551 };
6552 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
6553 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
6554 .output_width_tile = 1,
6555 .output_height_tile = 1,
6556 };
6557 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
6558 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
6559 .channel_tile = 1,
6560 };
6561 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
6562 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
6563 .channel_tile = 1,
6564 .pixel_tile = 4,
6565 };
6566 #endif // XNN_NO_NCHW_OPERATORS
6567 #endif // XNN_NO_F32_OPERATORS
6568
Marat Dukhana198f002022-01-04 18:45:11 -08006569 /************************** VCVT RISC-V micro-kernels *************************/
6570 #ifndef XNN_NO_VCVT_OPERATORS
6571 init_flags |= XNN_INIT_FLAG_VCVT;
6572
6573 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
6574 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x4,
6575 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
6576 .element_tile = 4,
6577 };
6578 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
6579 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
6580 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
6581 .element_tile = 2,
6582 };
6583 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6584 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_x4,
6585 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_lrintf_params,
6586 .element_tile = 4,
6587 };
6588 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6589 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_x4,
6590 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_lrintf_params,
6591 .element_tile = 4,
6592 };
6593 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
6594 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
6595 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
6596 .element_tile = 4,
6597 };
6598 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
6599 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
6600 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
6601 .element_tile = 4,
6602 };
6603 #endif // XNN_NO_VCVT_OPERATORS
6604
6605 /************************** X32 RISC-V micro-kernels **************************/
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006606 #ifndef XNN_NO_X32_OPERATORS
6607 init_flags |= XNN_INIT_FLAG_X32;
6608
Marat Dukhane3cb19b2021-05-07 03:38:49 -07006609 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
6610 xnn_params.x32.zip = (struct zip_parameters) {
6611 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
6612 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
6613 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
6614 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
6615 };
6616 #ifndef XNN_NO_NCHW_OPERATORS
6617 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
6618 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
6619 .channel_tile = 1,
6620 .pixel_tile = 1,
6621 };
6622 #endif // XNN_NO_NCHW_OPERATORS
6623 #endif // XNN_NO_X32_OPERATORS
6624
Marat Dukhana198f002022-01-04 18:45:11 -08006625 /************************** XX RISC-V micro-kernels ***************************/
Marat Dukhan0461f2d2021-08-08 12:36:29 -07006626 #ifndef XNN_NO_XX_OPERATORS
6627 init_flags |= XNN_INIT_FLAG_XX;
6628
6629 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
6630 xnn_params.xx.fill = (struct fill_parameters) {
6631 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
6632 .row_tile = 1,
6633 };
6634 xnn_params.xx.pad = (struct pad_parameters) {
6635 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
6636 .row_tile = 1,
6637 };
Marat Dukhana198f002022-01-04 18:45:11 -08006638 #endif // XNN_NO_XX_OPERATORS
Marat Dukhan0461f2d2021-08-08 12:36:29 -07006639
XNNPACK Teamb455b122019-09-27 18:10:33 -07006640#else
6641 #error "Unsupported architecture"
6642#endif
Marat Dukhan496389f2021-04-07 15:47:12 -07006643
6644 memcpy(&xnn_params.allocator, init_allocator, sizeof(struct xnn_allocator));
Marat Dukhan854fb6b2020-06-19 12:33:44 -07006645 xnn_params.init_flags = init_flags;
XNNPACK Teamb455b122019-09-27 18:10:33 -07006646}
6647
#if XNN_PLATFORM_WINDOWS
  // One-time initialization callback with the signature expected by
  // InitOnceExecuteOnce: runs init() and reports success to the caller.
  static BOOL CALLBACK init_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
    // None of the callback arguments are needed by init().
    (void) init_once;
    (void) parameter;
    (void) context;

    init();
    return TRUE;
  }
#endif  // XNN_PLATFORM_WINDOWS
6654
Marat Dukhan04f03be2019-11-19 12:36:47 -08006655enum xnn_status xnn_initialize(const struct xnn_allocator* allocator) {
Marat Dukhana198f002022-01-04 18:45:11 -08006656 #if !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
Marat Dukhand343c222019-10-07 09:22:14 -07006657 if (!cpuinfo_initialize()) {
6658 return xnn_status_out_of_memory;
6659 }
Marat Dukhana198f002022-01-04 18:45:11 -08006660 #endif // !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
Marat Dukhan496389f2021-04-07 15:47:12 -07006661 if (allocator == NULL) {
6662 allocator = &xnn_default_allocator;
6663 }
6664 #ifdef _MSC_VER
Marat Dukhandf94d982021-06-01 12:21:33 -07006665 _InterlockedCompareExchangePointer((PVOID volatile*) &init_allocator, (PVOID) allocator, NULL);
Marat Dukhan496389f2021-04-07 15:47:12 -07006666 #else
6667 __sync_bool_compare_and_swap(&init_allocator, NULL, allocator);
6668 #endif
Zhi An Ng0db15d32021-12-10 16:45:06 -08006669 #if XNN_PLATFORM_WINDOWS
Marat Dukhan57133c02020-04-13 00:54:59 -07006670 InitOnceExecuteOnce(&init_guard, &init_windows, NULL, NULL);
6671 #else
6672 pthread_once(&init_guard, &init);
6673 #endif
Marat Dukhan854fb6b2020-06-19 12:33:44 -07006674 if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) != 0) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07006675 return xnn_status_success;
6676 } else {
6677 return xnn_status_unsupported_hardware;
6678 }
6679}
6680
6681enum xnn_status xnn_deinitialize(void) {
Marat Dukhana198f002022-01-04 18:45:11 -08006682 #if !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
Marat Dukhand343c222019-10-07 09:22:14 -07006683 cpuinfo_deinitialize();
Marat Dukhana198f002022-01-04 18:45:11 -08006684 #endif // !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
XNNPACK Teamb455b122019-09-27 18:10:33 -07006685 return xnn_status_success;
Marat Dukhan3b9b4bc2021-08-24 00:23:07 -07006686}