blob: ca2979924aed577b935c6db3460a8255533b5e0d [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8
9#include <stdint.h>
10#include <stddef.h>
Marat Dukhanaf1671a2022-02-04 00:32:09 -080011#include <string.h>
Marat Dukhan6989ec42022-01-14 17:14:35 -080012
13#include <fp16.h>
Marat Dukhanab582382020-07-06 13:32:08 -070014
15#include <xnnpack/math.h>
16#include <xnnpack/pack.h>
17
18
Marat Dukhana6879bd2020-07-06 14:25:08 -070019void xnn_pack_f32_gemm_goi_w(
20 size_t g,
21 size_t nc,
22 size_t kc,
23 size_t nr,
24 size_t kr,
25 size_t sr,
26 const float* k,
27 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070028 float* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -070029 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -070030 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070031{
Marat Dukhanfbd67a72022-01-31 18:03:50 -080032 assert(nr >= sr);
33
Marat Dukhana6879bd2020-07-06 14:25:08 -070034 const size_t skr = sr * kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -070035 do {
36 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
37 const size_t nr_block_size = min(nc - nr_block_start, nr);
38 if XNN_LIKELY(b != NULL) {
39 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
40 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
41 }
42 }
43 packed_w += nr;
44
Marat Dukhanfbd67a72022-01-31 18:03:50 -080045 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhana6879bd2020-07-06 14:25:08 -070046 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
47 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -080048 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
49 if (kc_idx < kc) {
50 packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
51 }
Marat Dukhana6879bd2020-07-06 14:25:08 -070052 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -080053 packed_w += kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -070054 }
55 packed_w += (nr - nr_block_size) * kr;
56 }
Marat Dukhane06c8132021-06-03 08:59:11 -070057 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -070058 }
59 k += nc * kc;
60 if XNN_UNPREDICTABLE(b != NULL) {
61 b += nc;
62 }
63 } while (--g != 0);
64}
65
66void xnn_pack_f16_gemm_goi_w(
67 size_t g,
68 size_t nc,
69 size_t kc,
70 size_t nr,
71 size_t kr,
72 size_t sr,
73 const uint16_t* k,
74 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070075 uint16_t* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -070076 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -070077 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070078{
Marat Dukhanfbd67a72022-01-31 18:03:50 -080079 assert(nr >= sr);
80
Marat Dukhana6879bd2020-07-06 14:25:08 -070081 const size_t skr = sr * kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -070082 do {
83 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
84 const size_t nr_block_size = min(nc - nr_block_start, nr);
85 if XNN_LIKELY(b != NULL) {
86 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
87 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
88 }
89 }
90 packed_w += nr;
91
Marat Dukhanfbd67a72022-01-31 18:03:50 -080092 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhana6879bd2020-07-06 14:25:08 -070093 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
94 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -080095 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
96 if (kc_idx < kc) {
97 packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
98 }
Marat Dukhana6879bd2020-07-06 14:25:08 -070099 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800100 packed_w += kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700101 }
102 packed_w += (nr - nr_block_size) * kr;
103 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700104 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700105 }
106 k += nc * kc;
107 if XNN_UNPREDICTABLE(b != NULL) {
108 b += nc;
109 }
110 } while (--g != 0);
111}
112
Marat Dukhan6989ec42022-01-14 17:14:35 -0800113void xnn_pack_f32_to_f16_gemm_goi_w(
114 size_t g,
115 size_t nc,
116 size_t kc,
117 size_t nr,
118 size_t kr,
119 size_t sr,
120 const float* k,
121 const float* b,
122 uint16_t* packed_w,
123 size_t extra_bytes,
124 const void* params)
125{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800126 assert(nr >= sr);
127
Marat Dukhan6989ec42022-01-14 17:14:35 -0800128 const size_t skr = sr * kr;
Marat Dukhan6989ec42022-01-14 17:14:35 -0800129 do {
130 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
131 const size_t nr_block_size = min(nc - nr_block_start, nr);
132 if XNN_LIKELY(b != NULL) {
133 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
134 packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
135 }
136 }
137 packed_w += nr;
138
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800139 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhan6989ec42022-01-14 17:14:35 -0800140 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
141 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800142 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
143 if (kc_idx < kc) {
144 packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[(nr_block_start + nr_block_offset) * kc + kc_idx]);
145 }
Marat Dukhan6989ec42022-01-14 17:14:35 -0800146 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800147 packed_w += kr;
Marat Dukhan6989ec42022-01-14 17:14:35 -0800148 }
149 packed_w += (nr - nr_block_size) * kr;
150 }
151 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
152 }
153 k += nc * kc;
154 if XNN_UNPREDICTABLE(b != NULL) {
155 b += nc;
156 }
157 } while (--g != 0);
158}
159
Marat Dukhan08b7a972020-07-14 18:17:29 -0700160void xnn_pack_qu8_gemm_goi_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700161 size_t g,
162 size_t nc,
163 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700164 size_t nr,
165 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700166 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700167 const uint8_t* k,
168 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700169 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700170 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700171 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700172{
Marat Dukhan348c3772022-02-01 00:36:50 -0800173 assert(nr >= sr);
174
175 const size_t skr = sr * kr;
Marat Dukhanb42f8662020-07-06 20:46:13 -0700176 const int32_t izp = (int32_t) params->input_zero_point;
Marat Dukhan348c3772022-02-01 00:36:50 -0800177 const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700178 do {
179 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
180 const size_t nr_block_size = min(nc - nr_block_start, nr);
181 int32_t* packed_b = (int32_t*) packed_w;
182 if XNN_LIKELY(b != NULL) {
183 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
Marat Dukhan348c3772022-02-01 00:36:50 -0800184 *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
185 packed_w = (int32_t*) packed_w + 1;
Marat Dukhanab582382020-07-06 13:32:08 -0700186 }
187 } else {
188 size_t n = nr_block_size;
189 do {
Marat Dukhan348c3772022-02-01 00:36:50 -0800190 *((int32_t*) packed_w) = bzp;
191 packed_w = (int32_t*) packed_w + 1;
Marat Dukhanab582382020-07-06 13:32:08 -0700192 } while (--n != 0);
193 }
Marat Dukhan348c3772022-02-01 00:36:50 -0800194 packed_w = (int32_t*) packed_w + (nr - nr_block_size);
195
196 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhanab582382020-07-06 13:32:08 -0700197 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
198 int32_t ksum = 0;
Marat Dukhan348c3772022-02-01 00:36:50 -0800199 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
200 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
201 if (kc_idx < kc) {
202 const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
203 ksum += (int32_t) kv;
204 ((uint8_t*) packed_w)[kr_block_offset] = kv;
205 }
Marat Dukhanab582382020-07-06 13:32:08 -0700206 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700207 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhan348c3772022-02-01 00:36:50 -0800208 packed_w = (uint8_t*) packed_w + kr;
Marat Dukhanab582382020-07-06 13:32:08 -0700209 }
Marat Dukhan348c3772022-02-01 00:36:50 -0800210 packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
Marat Dukhanab582382020-07-06 13:32:08 -0700211 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700212 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -0700213 }
214 k += nc * kc;
215 if XNN_UNPREDICTABLE(b != NULL) {
216 b += nc;
217 }
218 } while (--g != 0);
219}
220
Marat Dukhan595e1702020-07-31 10:12:52 -0700221void xnn_pack_qs8_gemm_goi_w(
222 size_t g,
223 size_t nc,
224 size_t kc,
225 size_t nr,
226 size_t kr,
227 size_t sr,
228 const int8_t* k,
229 const int32_t* b,
230 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700231 size_t extra_bytes,
Marat Dukhan595e1702020-07-31 10:12:52 -0700232 const struct xnn_qs8_packing_params* params)
233{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800234 assert(nr >= sr);
235
Frank Barchard952cb512021-10-28 11:39:07 -0700236 const size_t skr = sr * kr;
Marat Dukhan595e1702020-07-31 10:12:52 -0700237 const int32_t izp = (int32_t) params->input_zero_point;
238 do {
239 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
240 const size_t nr_block_size = min(nc - nr_block_start, nr);
241 int32_t* packed_b = (int32_t*) packed_w;
242 if XNN_LIKELY(b != NULL) {
243 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
244 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800245 packed_w = (int32_t*) packed_w + 1;
Marat Dukhan595e1702020-07-31 10:12:52 -0700246 }
247 } else {
248 size_t n = nr_block_size;
249 do {
250 *((int32_t*) packed_w) = 0;
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800251 packed_w = (int32_t*) packed_w + 1;
Marat Dukhan595e1702020-07-31 10:12:52 -0700252 } while (--n != 0);
253 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800254 packed_w = (int32_t*) packed_w + (nr - nr_block_size);
Frank Barchard952cb512021-10-28 11:39:07 -0700255
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800256 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Frank Barchard952cb512021-10-28 11:39:07 -0700257 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
258 int32_t ksum = 0;
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800259 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
260 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
261 if (kc_idx < kc) {
262 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
263 ksum += (int32_t) kv;
264 ((int8_t*) packed_w)[kr_block_offset] = kv;
265 }
Frank Barchard952cb512021-10-28 11:39:07 -0700266 }
267 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800268 packed_w = (int8_t*) packed_w + kr;
Frank Barchard952cb512021-10-28 11:39:07 -0700269 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800270 packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
Marat Dukhan595e1702020-07-31 10:12:52 -0700271 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700272 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan595e1702020-07-31 10:12:52 -0700273 }
274 k += nc * kc;
275 if XNN_UNPREDICTABLE(b != NULL) {
276 b += nc;
277 }
278 } while (--g != 0);
279}
280
Marat Dukhan683fab32020-08-03 19:42:52 -0700281void xnn_pack_qs8_gemm_xw_goi_w(
282 size_t g,
283 size_t nc,
284 size_t kc,
285 size_t nr,
286 size_t kr,
287 size_t sr,
288 const int8_t* k,
289 const int32_t* b,
290 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700291 size_t extra_bytes,
Marat Dukhan683fab32020-08-03 19:42:52 -0700292 const struct xnn_qs8_packing_params* params)
293{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800294 assert(nr >= sr);
295
Frank Barchard66ae2572021-11-02 17:36:21 -0700296 const size_t skr = sr * kr;
Marat Dukhan683fab32020-08-03 19:42:52 -0700297 const int32_t izp = (int32_t) params->input_zero_point;
298 do {
299 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
300 const size_t nr_block_size = min(nc - nr_block_start, nr);
301 int32_t* packed_b = (int32_t*) packed_w;
302 if XNN_LIKELY(b != NULL) {
303 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
304 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
305 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
306 }
307 } else {
308 size_t n = nr_block_size;
309 do {
310 *((int32_t*) packed_w) = 0;
311 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
312 } while (--n != 0);
313 }
314 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
Frank Barchard66ae2572021-11-02 17:36:21 -0700315
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800316 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Frank Barchard66ae2572021-11-02 17:36:21 -0700317 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
318 int32_t ksum = 0;
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800319 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
320 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
321 if (kc_idx < kc) {
322 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
323 ksum += (int32_t) kv;
324 ((int16_t*) packed_w)[kr_block_offset] = (int16_t) kv;
325 }
Frank Barchard66ae2572021-11-02 17:36:21 -0700326 }
327 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800328 packed_w = (int16_t*) packed_w + kr;
Frank Barchard66ae2572021-11-02 17:36:21 -0700329 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800330 packed_w = (int16_t*) packed_w + (nr - nr_block_size) * kr;
Marat Dukhan683fab32020-08-03 19:42:52 -0700331 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700332 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan683fab32020-08-03 19:42:52 -0700333 }
334 k += nc * kc;
335 if XNN_UNPREDICTABLE(b != NULL) {
336 b += nc;
337 }
338 } while (--g != 0);
339}
340
Marat Dukhana6879bd2020-07-06 14:25:08 -0700341void xnn_pack_f32_gemm_io_w(
342 size_t nc,
343 size_t kc,
344 size_t nr,
345 size_t kr,
346 size_t sr,
347 const float* k,
348 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700349 float* packed_w,
350 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700351{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800352 assert(nr >= sr);
353
Marat Dukhana6879bd2020-07-06 14:25:08 -0700354 const size_t skr = sr * kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700355 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
356 const size_t nr_block_size = min(nc - nr_block_start, nr);
357 if XNN_LIKELY(b != NULL) {
358 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
359 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
360 }
361 }
362 packed_w += nr;
363
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800364 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhana6879bd2020-07-06 14:25:08 -0700365 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
366 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800367 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
368 if (kc_idx < kc) {
369 packed_w[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
370 }
Marat Dukhana6879bd2020-07-06 14:25:08 -0700371 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800372 packed_w += kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700373 }
374 packed_w += (nr - nr_block_size) * kr;
375 }
376 }
377}
378
379void xnn_pack_f16_gemm_io_w(
380 size_t nc,
381 size_t kc,
382 size_t nr,
383 size_t kr,
384 size_t sr,
385 const uint16_t* k,
386 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700387 uint16_t* packed_w,
388 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700389{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800390 assert(nr >= sr);
391
Marat Dukhana6879bd2020-07-06 14:25:08 -0700392 const size_t skr = sr * kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700393 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
394 const size_t nr_block_size = min(nc - nr_block_start, nr);
395 if XNN_LIKELY(b != NULL) {
396 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
397 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
398 }
399 }
400 packed_w += nr;
401
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800402 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhana6879bd2020-07-06 14:25:08 -0700403 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
404 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800405 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
406 if (kc_idx < kc) {
407 packed_w[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
408 }
Marat Dukhana6879bd2020-07-06 14:25:08 -0700409 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800410 packed_w += kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700411 }
412 packed_w += (nr - nr_block_size) * kr;
413 }
414 }
415}
416
Marat Dukhan1d6b7c92022-01-14 21:18:44 -0800417void xnn_pack_f32_to_f16_gemm_io_w(
418 size_t nc,
419 size_t kc,
420 size_t nr,
421 size_t kr,
422 size_t sr,
423 const float* k,
424 const float* b,
425 uint16_t* packed_w,
426 const void* params)
427{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800428 assert(nr >= sr);
429
Marat Dukhan1d6b7c92022-01-14 21:18:44 -0800430 const size_t skr = sr * kr;
Marat Dukhan1d6b7c92022-01-14 21:18:44 -0800431 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
432 const size_t nr_block_size = min(nc - nr_block_start, nr);
433 if XNN_LIKELY(b != NULL) {
434 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
435 packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
436 }
437 }
438 packed_w += nr;
439
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800440 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhan1d6b7c92022-01-14 21:18:44 -0800441 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
442 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800443 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
444 if (kc_idx < kc) {
445 packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[kc_idx * nc + nr_block_start + nr_block_offset]);
446 }
Marat Dukhan1d6b7c92022-01-14 21:18:44 -0800447 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800448 packed_w += kr;
Marat Dukhan1d6b7c92022-01-14 21:18:44 -0800449 }
450 packed_w += (nr - nr_block_size) * kr;
451 }
452 }
453}
454
Marat Dukhan08b7a972020-07-14 18:17:29 -0700455void xnn_pack_qu8_gemm_io_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700456 size_t nc,
457 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700458 size_t nr,
459 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700460 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700461 const uint8_t* k,
462 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700463 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700464 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700465{
Marat Dukhan58cdcf22022-02-01 02:05:00 -0800466 assert(nr >= sr);
467
468 const size_t skr = sr * kr;
Marat Dukhanb42f8662020-07-06 20:46:13 -0700469 const int32_t izp = (int32_t) params->input_zero_point;
Marat Dukhan58cdcf22022-02-01 02:05:00 -0800470 const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700471 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
472 const size_t nr_block_size = min(nc - nr_block_start, nr);
473 int32_t* packed_b = (int32_t*) packed_w;
474 if XNN_LIKELY(b != NULL) {
475 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
Marat Dukhan58cdcf22022-02-01 02:05:00 -0800476 *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
477 packed_w = (int32_t*) packed_w + 1;
Marat Dukhanab582382020-07-06 13:32:08 -0700478 }
479 } else {
480 size_t n = nr_block_size;
481 do {
Marat Dukhan58cdcf22022-02-01 02:05:00 -0800482 *((int32_t*) packed_w) = bzp;
483 packed_w = (int32_t*) packed_w + 1;
Marat Dukhanab582382020-07-06 13:32:08 -0700484 } while (--n != 0);
485 }
Marat Dukhan58cdcf22022-02-01 02:05:00 -0800486 packed_w = (int32_t*) packed_w + (nr - nr_block_size);
487
488 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhanab582382020-07-06 13:32:08 -0700489 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
490 int32_t ksum = 0;
Marat Dukhan58cdcf22022-02-01 02:05:00 -0800491 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
492 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
493 if (kc_idx < kc) {
494 const uint8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
495 ksum += (int32_t) kv;
496 ((uint8_t*) packed_w)[kr_block_offset] = kv;
497 }
Marat Dukhanab582382020-07-06 13:32:08 -0700498 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700499 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhan58cdcf22022-02-01 02:05:00 -0800500 packed_w = (uint8_t*) packed_w + kr;
Marat Dukhanab582382020-07-06 13:32:08 -0700501 }
Marat Dukhan58cdcf22022-02-01 02:05:00 -0800502 packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
Marat Dukhanab582382020-07-06 13:32:08 -0700503 }
504 }
505}
506
Marat Dukhand23cb6e2021-04-01 01:18:58 -0700507void xnn_pack_qs8_gemm_io_w(
508 size_t nc,
509 size_t kc,
510 size_t nr,
511 size_t kr,
512 size_t sr,
513 const int8_t* k,
514 const int32_t* b,
515 void* packed_w,
516 const struct xnn_qs8_packing_params* params)
517{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800518 assert(nr >= sr);
519
Frank Barchard66ae2572021-11-02 17:36:21 -0700520 const size_t skr = sr * kr;
Marat Dukhand23cb6e2021-04-01 01:18:58 -0700521 const int32_t izp = (int32_t) params->input_zero_point;
522 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
523 const size_t nr_block_size = min(nc - nr_block_start, nr);
524 int32_t* packed_b = (int32_t*) packed_w;
525 if XNN_LIKELY(b != NULL) {
526 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
527 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800528 packed_w = (int32_t*) packed_w + 1;
Marat Dukhand23cb6e2021-04-01 01:18:58 -0700529 }
530 } else {
531 size_t n = nr_block_size;
532 do {
533 *((int32_t*) packed_w) = 0;
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800534 packed_w = (int32_t*) packed_w + 1;
Marat Dukhand23cb6e2021-04-01 01:18:58 -0700535 } while (--n != 0);
536 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800537 packed_w = (uint32_t*) packed_w + (nr - nr_block_size);
Frank Barchard66ae2572021-11-02 17:36:21 -0700538
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800539 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Frank Barchard66ae2572021-11-02 17:36:21 -0700540 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
541 int32_t ksum = 0;
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800542 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
543 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
544 if (kc_idx < kc) {
545 const int8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
546 ksum += (int32_t) kv;
547 ((int8_t*) packed_w)[kr_block_offset] = kv;
548 }
Frank Barchard66ae2572021-11-02 17:36:21 -0700549 }
550 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800551 packed_w = (int8_t*) packed_w + kr;
Frank Barchard66ae2572021-11-02 17:36:21 -0700552 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800553 packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
Marat Dukhand23cb6e2021-04-01 01:18:58 -0700554 }
555 }
556}
557
Marat Dukhana6879bd2020-07-06 14:25:08 -0700558void xnn_pack_f32_conv_goki_w(
559 size_t g,
560 size_t nc,
561 size_t ks,
562 size_t kc,
563 size_t nr,
564 size_t kr,
565 size_t sr,
566 const float* k,
567 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700568 float* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700569 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700570 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700571{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800572 assert(nr >= sr);
573
Marat Dukhana6879bd2020-07-06 14:25:08 -0700574 const size_t skr = sr * kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700575 do {
576 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
577 const size_t nr_block_size = min(nc - nr_block_start, nr);
578 if XNN_LIKELY(b != NULL) {
579 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
580 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
581 }
582 }
583 packed_w += nr;
584
585 for (size_t ki = 0; ki < ks; ki++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800586 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhana6879bd2020-07-06 14:25:08 -0700587 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
588 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800589 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
590 if (kc_idx < kc) {
591 packed_w[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
592 }
Marat Dukhana6879bd2020-07-06 14:25:08 -0700593 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800594 packed_w += kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700595 }
596 packed_w += (nr - nr_block_size) * kr;
597 }
598 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700599 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700600 }
601 k += ks * kc * nc;
602 if XNN_UNPREDICTABLE(b != NULL) {
603 b += nc;
604 }
605 } while (--g != 0);
606}
607
608void xnn_pack_f16_conv_goki_w(
609 size_t g,
610 size_t nc,
611 size_t ks,
612 size_t kc,
613 size_t nr,
614 size_t kr,
615 size_t sr,
616 const uint16_t* k,
617 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700618 uint16_t* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700619 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700620 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700621{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800622 assert(nr >= sr);
623
Marat Dukhana6879bd2020-07-06 14:25:08 -0700624 const size_t skr = sr * kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700625 do {
626 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
627 const size_t nr_block_size = min(nc - nr_block_start, nr);
628 if XNN_LIKELY(b != NULL) {
629 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
630 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
631 }
632 }
633 packed_w += nr;
634
635 for (size_t ki = 0; ki < ks; ki++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800636 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhana6879bd2020-07-06 14:25:08 -0700637 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
638 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800639 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
640 if (kc_idx < kc) {
641 packed_w[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
642 }
Marat Dukhana6879bd2020-07-06 14:25:08 -0700643 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800644 packed_w += kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700645 }
646 packed_w += (nr - nr_block_size) * kr;
647 }
648 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700649 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700650 }
651 k += ks * kc * nc;
652 if XNN_UNPREDICTABLE(b != NULL) {
653 b += nc;
654 }
655 } while (--g != 0);
656}
657
Marat Dukhan6989ec42022-01-14 17:14:35 -0800658void xnn_pack_f32_to_f16_conv_goki_w(
659 size_t g,
660 size_t nc,
661 size_t ks,
662 size_t kc,
663 size_t nr,
664 size_t kr,
665 size_t sr,
666 const float* k,
667 const float* b,
668 uint16_t* packed_w,
669 size_t extra_bytes,
670 const void* params)
671{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800672 assert(nr >= sr);
673
Marat Dukhan6989ec42022-01-14 17:14:35 -0800674 const size_t skr = sr * kr;
Marat Dukhan6989ec42022-01-14 17:14:35 -0800675 do {
676 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
677 const size_t nr_block_size = min(nc - nr_block_start, nr);
678 if XNN_LIKELY(b != NULL) {
679 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
680 packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
681 }
682 }
683 packed_w += nr;
684
685 for (size_t ki = 0; ki < ks; ki++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800686 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhan6989ec42022-01-14 17:14:35 -0800687 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
688 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800689 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
690 if (kc_idx < kc) {
691 packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx]);
692 }
Marat Dukhan6989ec42022-01-14 17:14:35 -0800693 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800694 packed_w += kr;
Marat Dukhan6989ec42022-01-14 17:14:35 -0800695 }
696 packed_w += (nr - nr_block_size) * kr;
697 }
698 }
699 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
700 }
701 k += ks * kc * nc;
702 if XNN_UNPREDICTABLE(b != NULL) {
703 b += nc;
704 }
705 } while (--g != 0);
706}
707
Marat Dukhan08b7a972020-07-14 18:17:29 -0700708void xnn_pack_qu8_conv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700709 size_t g,
710 size_t nc,
711 size_t ks,
712 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700713 size_t nr,
714 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700715 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700716 const uint8_t* k,
717 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700718 void* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700719 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700720 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700721{
Marat Dukhan348c3772022-02-01 00:36:50 -0800722 assert(nr >= sr);
723
724 const size_t skr = sr * kr;
Marat Dukhanb42f8662020-07-06 20:46:13 -0700725 const int32_t izp = (int32_t) params->input_zero_point;
Marat Dukhan348c3772022-02-01 00:36:50 -0800726 const int32_t bzp = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700727 do {
728 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
729 const size_t nr_block_size = min(nc - nr_block_start, nr);
730 int32_t* packed_b = (int32_t*) packed_w;
731 if XNN_LIKELY(b != NULL) {
732 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
Marat Dukhan348c3772022-02-01 00:36:50 -0800733 *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
Marat Dukhanab582382020-07-06 13:32:08 -0700734 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
735 }
736 } else {
737 size_t n = nr_block_size;
738 do {
Marat Dukhan348c3772022-02-01 00:36:50 -0800739 *((int32_t*) packed_w) = bzp;
Marat Dukhanab582382020-07-06 13:32:08 -0700740 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
741 } while (--n != 0);
742 }
743 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
Marat Dukhan348c3772022-02-01 00:36:50 -0800744
Marat Dukhanab582382020-07-06 13:32:08 -0700745 for (size_t ki = 0; ki < ks; ki++) {
Marat Dukhan348c3772022-02-01 00:36:50 -0800746 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhanab582382020-07-06 13:32:08 -0700747 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
748 int32_t ksum = 0;
Marat Dukhan348c3772022-02-01 00:36:50 -0800749 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
750 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
751 if (kc_idx < kc) {
752 const uint8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
753 ksum += (int32_t) kv;
754 ((uint8_t*) packed_w)[kr_block_offset] = kv;
755 }
Marat Dukhanab582382020-07-06 13:32:08 -0700756 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700757 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhan348c3772022-02-01 00:36:50 -0800758 packed_w = (uint8_t*) packed_w + kr;
Marat Dukhanab582382020-07-06 13:32:08 -0700759 }
Marat Dukhan348c3772022-02-01 00:36:50 -0800760 packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
Marat Dukhanab582382020-07-06 13:32:08 -0700761 }
762 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700763 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -0700764 }
765 k += ks * kc * nc;
766 if XNN_UNPREDICTABLE(b != NULL) {
767 b += nc;
768 }
769 } while (--g != 0);
770}
771
Marat Dukhanf9480682020-07-31 14:50:24 -0700772void xnn_pack_qs8_conv_goki_w(
773 size_t g,
774 size_t nc,
775 size_t ks,
776 size_t kc,
777 size_t nr,
778 size_t kr,
779 size_t sr,
780 const int8_t* k,
781 const int32_t* b,
782 void* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700783 size_t extra_bytes,
Marat Dukhanf9480682020-07-31 14:50:24 -0700784 const struct xnn_qs8_packing_params* params)
785{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800786 assert(nr >= sr);
787
Frank Barchard952cb512021-10-28 11:39:07 -0700788 const size_t skr = sr * kr;
Marat Dukhanf9480682020-07-31 14:50:24 -0700789 const int32_t izp = (int32_t) params->input_zero_point;
790 do {
791 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
792 const size_t nr_block_size = min(nc - nr_block_start, nr);
793 int32_t* packed_b = (int32_t*) packed_w;
794 if XNN_LIKELY(b != NULL) {
795 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
796 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
797 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
798 }
799 } else {
800 size_t n = nr_block_size;
801 do {
802 *((int32_t*) packed_w) = 0;
803 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
804 } while (--n != 0);
805 }
806 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
Frank Barchard952cb512021-10-28 11:39:07 -0700807
Marat Dukhanf9480682020-07-31 14:50:24 -0700808 for (size_t ki = 0; ki < ks; ki++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800809 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Frank Barchard952cb512021-10-28 11:39:07 -0700810 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
811 int32_t ksum = 0;
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800812 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
813 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
814 if (kc_idx < kc) {
815 const int8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
816 ksum += (int32_t) kv;
817 ((int8_t*) packed_w)[kr_block_offset] = kv;
818 }
Frank Barchard952cb512021-10-28 11:39:07 -0700819 }
820 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800821 packed_w = (int8_t*) packed_w + kr;
Frank Barchard952cb512021-10-28 11:39:07 -0700822 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800823 packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
Frank Barchard952cb512021-10-28 11:39:07 -0700824 }
Marat Dukhanf9480682020-07-31 14:50:24 -0700825 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700826 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanf9480682020-07-31 14:50:24 -0700827 }
828 k += ks * kc * nc;
829 if XNN_UNPREDICTABLE(b != NULL) {
830 b += nc;
831 }
832 } while (--g != 0);
833}
834
Marat Dukhana6879bd2020-07-06 14:25:08 -0700835void xnn_pack_f32_conv_kgo_w(
836 size_t g,
837 size_t nc,
838 size_t ks,
839 size_t nr,
840 size_t kr,
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800841 size_t sr,
Marat Dukhana6879bd2020-07-06 14:25:08 -0700842 const float* k,
843 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700844 float* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700845 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700846 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700847{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800848 assert(nr >= sr);
849
Marat Dukhana6879bd2020-07-06 14:25:08 -0700850 for (size_t i = 0; i < g; i++) {
851 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
852 const size_t nr_block_size = min(nc - nr_block_start, nr);
853 if XNN_LIKELY(b != NULL) {
854 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
855 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
856 }
857 }
858 packed_w += nr;
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800859
Marat Dukhana6879bd2020-07-06 14:25:08 -0700860 for (size_t ki = 0; ki < ks; ki++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800861 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
862 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
863 packed_w[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
864 }
865 packed_w += nr * kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700866 }
Marat Dukhana6879bd2020-07-06 14:25:08 -0700867 }
Marat Dukhan97262462021-06-18 16:14:17 -0700868 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700869 }
870 k += nc;
871 if XNN_UNPREDICTABLE(b != NULL) {
872 b += nc;
873 }
874 }
875}
876
877void xnn_pack_f16_conv_kgo_w(
878 size_t g,
879 size_t nc,
880 size_t ks,
881 size_t nr,
882 size_t kr,
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800883 size_t sr,
Marat Dukhana6879bd2020-07-06 14:25:08 -0700884 const uint16_t* k,
885 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700886 uint16_t* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700887 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700888 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700889{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800890 assert(nr >= sr);
891
Marat Dukhana6879bd2020-07-06 14:25:08 -0700892 for (size_t i = 0; i < g; i++) {
893 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
894 const size_t nr_block_size = min(nc - nr_block_start, nr);
895 if XNN_LIKELY(b != NULL) {
896 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
897 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
898 }
899 }
900 packed_w += nr;
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800901
Marat Dukhana6879bd2020-07-06 14:25:08 -0700902 for (size_t ki = 0; ki < ks; ki++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800903 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
904 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
905 packed_w[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
906 }
907 packed_w += nr * kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700908 }
Marat Dukhana6879bd2020-07-06 14:25:08 -0700909 }
Marat Dukhan97262462021-06-18 16:14:17 -0700910 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700911 }
912 k += nc;
913 if XNN_UNPREDICTABLE(b != NULL) {
914 b += nc;
915 }
916 }
917}
918
Marat Dukhan6989ec42022-01-14 17:14:35 -0800919void xnn_pack_f32_to_f16_conv_kgo_w(
920 size_t g,
921 size_t nc,
922 size_t ks,
923 size_t nr,
924 size_t kr,
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800925 size_t sr,
Marat Dukhan6989ec42022-01-14 17:14:35 -0800926 const float* k,
927 const float* b,
928 uint16_t* packed_w,
929 size_t extra_bytes,
930 const void* params)
931{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800932 assert(nr >= sr);
933
Marat Dukhan6989ec42022-01-14 17:14:35 -0800934 for (size_t i = 0; i < g; i++) {
935 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
936 const size_t nr_block_size = min(nc - nr_block_start, nr);
937 if XNN_LIKELY(b != NULL) {
938 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
939 packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
940 }
941 }
942 packed_w += nr;
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800943
Marat Dukhan6989ec42022-01-14 17:14:35 -0800944 for (size_t ki = 0; ki < ks; ki++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800945 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
946 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
947 packed_w[nr_block_offset * kr] = fp16_ieee_from_fp32_value(k[ki * g * nc + (nr_block_start + nr_block_offset)]);
948 }
949 packed_w += nr * kr;
Marat Dukhan6989ec42022-01-14 17:14:35 -0800950 }
Marat Dukhan6989ec42022-01-14 17:14:35 -0800951 }
952 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
953 }
954 k += nc;
955 if XNN_UNPREDICTABLE(b != NULL) {
956 b += nc;
957 }
958 }
959}
960
Marat Dukhan08b7a972020-07-14 18:17:29 -0700961void xnn_pack_qu8_conv_kgo_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700962 size_t g,
963 size_t nc,
964 size_t ks,
Marat Dukhana6879bd2020-07-06 14:25:08 -0700965 size_t nr,
966 size_t kr,
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800967 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700968 const uint8_t* k,
969 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700970 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700971 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700972 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700973{
Marat Dukhanfbd67a72022-01-31 18:03:50 -0800974 assert(nr >= sr);
975
Marat Dukhanb42f8662020-07-06 20:46:13 -0700976 const int32_t izp = (int32_t) params->input_zero_point;
Marat Dukhan58cdcf22022-02-01 02:05:00 -0800977 const int32_t bzp = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700978 for (size_t i = 0; i < g; i++) {
979 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
980 const size_t nr_block_size = min(nc - nr_block_start, nr);
981 int32_t* packed_b = (int32_t*) packed_w;
982 if XNN_LIKELY(b != NULL) {
983 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
Marat Dukhan58cdcf22022-02-01 02:05:00 -0800984 *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
Marat Dukhanab582382020-07-06 13:32:08 -0700985 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
986 }
987 } else {
988 size_t n = nr_block_size;
989 do {
Marat Dukhan58cdcf22022-02-01 02:05:00 -0800990 *((int32_t*) packed_w) = bzp;
Marat Dukhanab582382020-07-06 13:32:08 -0700991 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
992 } while (--n != 0);
993 }
994 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
Marat Dukhan58cdcf22022-02-01 02:05:00 -0800995
Marat Dukhanab582382020-07-06 13:32:08 -0700996 for (size_t ki = 0; ki < ks; ki++) {
Marat Dukhan58cdcf22022-02-01 02:05:00 -0800997 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
998 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
999 const uint8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
1000 ((uint8_t*) packed_w)[nr_block_offset * kr] = kv;
1001 packed_b[nr_block_offset] -= (int32_t) kv * izp;
1002 }
1003 packed_w = (uint8_t*) packed_w + nr * kr;
Marat Dukhanab582382020-07-06 13:32:08 -07001004 }
Marat Dukhanab582382020-07-06 13:32:08 -07001005 }
Marat Dukhan97262462021-06-18 16:14:17 -07001006 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -07001007 }
1008 k += nc;
1009 if XNN_UNPREDICTABLE(b != NULL) {
1010 b += nc;
1011 }
1012 }
1013}
1014
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001015void xnn_pack_qs8_conv_kgo_w(
1016 size_t g,
1017 size_t nc,
1018 size_t ks,
1019 size_t nr,
1020 size_t kr,
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001021 size_t sr,
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001022 const int8_t* k,
1023 const int32_t* b,
1024 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001025 size_t extra_bytes,
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001026 const struct xnn_qs8_packing_params* params)
1027{
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001028 assert(nr >= sr);
1029
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001030 const int32_t izp = (int32_t) params->input_zero_point;
1031 for (size_t i = 0; i < g; i++) {
1032 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1033 const size_t nr_block_size = min(nc - nr_block_start, nr);
1034 int32_t* packed_b = (int32_t*) packed_w;
1035 if XNN_LIKELY(b != NULL) {
1036 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1037 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
1038 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1039 }
1040 } else {
1041 size_t n = nr_block_size;
1042 do {
1043 *((int32_t*) packed_w) = 0;
1044 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1045 } while (--n != 0);
1046 }
1047 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001048
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001049 for (size_t ki = 0; ki < ks; ki++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001050 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1051 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1052 const int8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
1053 ((int8_t*) packed_w)[nr_block_offset * kr] = kv;
1054 packed_b[nr_block_offset] -= (int32_t) kv * izp;
1055 }
1056 packed_w = (int8_t*) packed_w + nr * kr;
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001057 }
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001058 }
Marat Dukhan97262462021-06-18 16:14:17 -07001059 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001060 }
1061 k += nc;
1062 if XNN_UNPREDICTABLE(b != NULL) {
1063 b += nc;
1064 }
1065 }
1066}
1067
Marat Dukhana6879bd2020-07-06 14:25:08 -07001068void xnn_pack_f32_deconv_goki_w(
1069 size_t g,
1070 size_t nc,
1071 size_t kh,
1072 size_t kw,
1073 size_t kc,
1074 size_t sh,
1075 size_t sw,
1076 size_t nr,
1077 size_t kr,
1078 size_t sr,
1079 const float* k,
1080 const float* b,
1081 float* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001082 struct subconvolution_params* subconv_params,
1083 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001084{
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001085 assert(nr >= sr);
1086
Marat Dukhana6879bd2020-07-06 14:25:08 -07001087 const size_t skr = sr * kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -07001088 for (size_t i = 0; i < g; i++) {
1089 for (size_t oy = 0; oy < sh; oy++) {
1090 for (size_t ox = 0; ox < sw; ox++) {
1091 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -07001092 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -07001093 }
1094 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1095 const size_t nr_block_size = min(nc - nr_block_start, nr);
1096 if XNN_LIKELY(b != NULL) {
1097 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1098 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
1099 }
1100 }
1101 packed_w += nr;
1102 for (size_t ky = oy; ky < kh; ky += sh) {
1103 for (size_t kx = ox; kx < kw; kx += sw) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001104 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001105 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1106 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001107 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1108 if (kc_idx < kc) {
1109 packed_w[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1110 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001111 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001112 packed_w += kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -07001113 }
1114 packed_w += (nr - nr_block_size) * kr;
1115 }
1116 }
1117 }
1118 }
1119 }
1120 }
1121 k += kh * kw * kc * nc;
1122 if XNN_UNPREDICTABLE(b != NULL) {
1123 b += nc;
1124 }
1125 }
1126}
1127
1128void xnn_pack_f16_deconv_goki_w(
1129 size_t g,
1130 size_t nc,
1131 size_t kh,
1132 size_t kw,
1133 size_t kc,
1134 size_t sh,
1135 size_t sw,
1136 size_t nr,
1137 size_t kr,
1138 size_t sr,
1139 const uint16_t* k,
1140 const uint16_t* b,
1141 uint16_t* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001142 struct subconvolution_params* subconv_params,
1143 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001144{
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001145 assert(nr >= sr);
1146
Marat Dukhana6879bd2020-07-06 14:25:08 -07001147 const size_t skr = sr * kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -07001148 for (size_t i = 0; i < g; i++) {
1149 for (size_t oy = 0; oy < sh; oy++) {
1150 for (size_t ox = 0; ox < sw; ox++) {
1151 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -07001152 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -07001153 }
1154 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1155 const size_t nr_block_size = min(nc - nr_block_start, nr);
1156 if XNN_LIKELY(b != NULL) {
1157 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1158 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
1159 }
1160 }
1161 packed_w += nr;
1162 for (size_t ky = oy; ky < kh; ky += sh) {
1163 for (size_t kx = ox; kx < kw; kx += sw) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001164 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001165 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1166 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001167 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1168 if (kc_idx < kc) {
1169 packed_w[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1170 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001171 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001172 packed_w += kr;
Marat Dukhana6879bd2020-07-06 14:25:08 -07001173 }
1174 packed_w += (nr - nr_block_size) * kr;
1175 }
1176 }
1177 }
1178 }
1179 }
1180 }
1181 k += kh * kw * kc * nc;
1182 if XNN_UNPREDICTABLE(b != NULL) {
1183 b += nc;
1184 }
1185 }
1186}
1187
Marat Dukhanbea849a2021-07-30 16:25:30 -07001188void xnn_pack_qs8_deconv_goki_w(
1189 size_t g,
1190 size_t nc,
1191 size_t kh,
1192 size_t kw,
1193 size_t kc,
1194 size_t sh,
1195 size_t sw,
1196 size_t nr,
1197 size_t kr,
1198 size_t sr,
1199 const int8_t* k,
1200 const int32_t* b,
1201 void* packed_w,
1202 struct subconvolution_params* subconv_params,
1203 const struct xnn_qs8_packing_params* params)
1204{
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001205 assert(nr >= sr);
1206
Frank Barchard66ae2572021-11-02 17:36:21 -07001207 const size_t skr = sr * kr;
Marat Dukhanbea849a2021-07-30 16:25:30 -07001208 const int32_t izp = (int32_t) params->input_zero_point;
1209 for (size_t i = 0; i < g; i++) {
1210 for (size_t oy = 0; oy < sh; oy++) {
1211 for (size_t ox = 0; ox < sw; ox++) {
1212 if (i == 0) {
1213 (*subconv_params++).weights = packed_w;
1214 }
1215 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1216 const size_t nr_block_size = min(nc - nr_block_start, nr);
1217 int32_t* packed_b = (int32_t*) packed_w;
1218 if XNN_LIKELY(b != 0) {
1219 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1220 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
1221 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1222 }
1223 } else {
1224 size_t n = nr_block_size;
1225 do {
1226 *((int32_t*) packed_w) = 0;
1227 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1228 } while (--n != 0);
1229 }
1230 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1231 for (size_t ky = oy; ky < kh; ky += sh) {
1232 for (size_t kx = ox; kx < kw; kx += sw) {
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001233 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Frank Barchard66ae2572021-11-02 17:36:21 -07001234 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1235 int32_t ksum = 0;
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001236 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1237 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1238 if (kc_idx < kc) {
1239 const int8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1240 ksum += (int32_t) kv;
1241 ((int8_t*) packed_w)[kr_block_offset] = kv;
1242 }
Frank Barchard66ae2572021-11-02 17:36:21 -07001243 }
1244 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001245 packed_w = (int8_t*) packed_w + kr;
Frank Barchard66ae2572021-11-02 17:36:21 -07001246 }
Marat Dukhanfbd67a72022-01-31 18:03:50 -08001247 packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
Frank Barchard66ae2572021-11-02 17:36:21 -07001248 }
Marat Dukhanbea849a2021-07-30 16:25:30 -07001249 }
1250 }
1251 }
1252 }
1253 }
1254 k += kh * kw * kc * nc;
1255 if XNN_UNPREDICTABLE(b != NULL) {
1256 b += nc;
1257 }
1258 }
1259}
1260
Marat Dukhan08b7a972020-07-14 18:17:29 -07001261void xnn_pack_qu8_deconv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001262 size_t g,
1263 size_t nc,
1264 size_t kh,
1265 size_t kw,
1266 size_t kc,
1267 size_t sh,
1268 size_t sw,
1269 size_t nr,
1270 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -07001271 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -07001272 const uint8_t* k,
1273 const int32_t* b,
1274 void* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001275 struct subconvolution_params* subconv_params,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001276 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001277{
Marat Dukhan58cdcf22022-02-01 02:05:00 -08001278 assert(nr >= sr);
1279
1280 const size_t skr = sr * kr;
Marat Dukhanb42f8662020-07-06 20:46:13 -07001281 const int32_t izp = (int32_t) params->input_zero_point;
1282 const int32_t kzp = (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001283 for (size_t i = 0; i < g; i++) {
1284 for (size_t oy = 0; oy < sh; oy++) {
1285 for (size_t ox = 0; ox < sw; ox++) {
1286 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -07001287 (*subconv_params++).weights = packed_w;
Marat Dukhanab582382020-07-06 13:32:08 -07001288 }
Marat Dukhan58cdcf22022-02-01 02:05:00 -08001289 const int32_t bzp = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
Marat Dukhanab582382020-07-06 13:32:08 -07001290 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1291 const size_t nr_block_size = min(nc - nr_block_start, nr);
1292 int32_t* packed_b = (int32_t*) packed_w;
1293 if XNN_LIKELY(b != 0) {
1294 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
Marat Dukhan58cdcf22022-02-01 02:05:00 -08001295 *((int32_t*) packed_w) = bzp + b[nr_block_start + nr_block_offset];
Marat Dukhanab582382020-07-06 13:32:08 -07001296 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1297 }
1298 } else {
1299 size_t n = nr_block_size;
1300 do {
Marat Dukhan58cdcf22022-02-01 02:05:00 -08001301 *((int32_t*) packed_w) = bzp;
Marat Dukhanab582382020-07-06 13:32:08 -07001302 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1303 } while (--n != 0);
1304 }
1305 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1306 for (size_t ky = oy; ky < kh; ky += sh) {
1307 for (size_t kx = ox; kx < kw; kx += sw) {
Marat Dukhan58cdcf22022-02-01 02:05:00 -08001308 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
Marat Dukhanab582382020-07-06 13:32:08 -07001309 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1310 int32_t ksum = 0;
Marat Dukhan58cdcf22022-02-01 02:05:00 -08001311 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1312 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1313 if (kc_idx < kc) {
1314 const uint8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1315 ksum += (int32_t) kv;
1316 ((uint8_t*) packed_w)[kr_block_offset] = kv;
1317 }
Marat Dukhanab582382020-07-06 13:32:08 -07001318 }
Marat Dukhanb42f8662020-07-06 20:46:13 -07001319 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhan58cdcf22022-02-01 02:05:00 -08001320 packed_w = (uint8_t*) packed_w + kr;
Marat Dukhanab582382020-07-06 13:32:08 -07001321 }
Marat Dukhan58cdcf22022-02-01 02:05:00 -08001322 packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
Marat Dukhanab582382020-07-06 13:32:08 -07001323 }
1324 }
1325 }
1326 }
1327 }
1328 }
1329 k += kh * kw * kc * nc;
1330 if XNN_UNPREDICTABLE(b != NULL) {
1331 b += nc;
1332 }
1333 }
1334}
1335
Marat Dukhana6879bd2020-07-06 14:25:08 -07001336void xnn_pack_f32_dwconv_ghw_w(
1337 size_t h,
1338 size_t w,
1339 size_t c,
1340 size_t cr,
1341 const float* k,
1342 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001343 float* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001344 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001345 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001346{
1347 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1348 const size_t cr_block_size = min(c - cr_block_start, cr);
1349 if XNN_LIKELY(b != NULL) {
1350 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1351 *packed_w++ = b[cr_block_start + cr_block_offset];
1352 }
1353 } else {
1354 size_t n = cr_block_size;
1355 do {
1356 *packed_w++ = 0.0f;
1357 } while (--n != 0);
1358 }
1359 packed_w += cr - cr_block_size;
1360 for (size_t x = 0; x < w; x++) {
1361 for (size_t y = 0; y < h; y++) {
1362 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1363 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1364 *packed_w++ = kv;
1365 }
1366 packed_w += cr - cr_block_size;
1367 }
1368 }
Marat Dukhan82286892021-06-04 17:27:27 -07001369 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001370 }
1371}
1372
1373void xnn_pack_f16_dwconv_ghw_w(
1374 size_t h,
1375 size_t w,
1376 size_t c,
1377 size_t cr,
1378 const uint16_t* k,
1379 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001380 uint16_t* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001381 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001382 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001383{
1384 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1385 const size_t cr_block_size = min(c - cr_block_start, cr);
1386 if XNN_LIKELY(b != NULL) {
1387 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1388 *packed_w++ = b[cr_block_start + cr_block_offset];
1389 }
1390 } else {
1391 size_t n = cr_block_size;
1392 do {
1393 *packed_w++ = 0;
1394 } while (--n != 0);
1395 }
1396 packed_w += cr - cr_block_size;
1397 for (size_t x = 0; x < w; x++) {
1398 for (size_t y = 0; y < h; y++) {
1399 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1400 const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1401 *packed_w++ = kv;
1402 }
1403 packed_w += cr - cr_block_size;
1404 }
1405 }
Marat Dukhan82286892021-06-04 17:27:27 -07001406 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001407 }
1408}
1409
Marat Dukhan6989ec42022-01-14 17:14:35 -08001410void xnn_pack_f32_to_f16_dwconv_ghw_w(
1411 size_t h,
1412 size_t w,
1413 size_t c,
1414 size_t cr,
1415 const float* k,
1416 const float* b,
1417 uint16_t* packed_w,
1418 size_t extra_bytes,
1419 const void* params)
1420{
1421 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1422 const size_t cr_block_size = min(c - cr_block_start, cr);
1423 if XNN_LIKELY(b != NULL) {
1424 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1425 *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
1426 }
1427 } else {
1428 size_t n = cr_block_size;
1429 do {
1430 *packed_w++ = 0;
1431 } while (--n != 0);
1432 }
1433 packed_w += cr - cr_block_size;
1434 for (size_t x = 0; x < w; x++) {
1435 for (size_t y = 0; y < h; y++) {
1436 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1437 const uint16_t kv = fp16_ieee_from_fp32_value(k[((cr_block_start + cr_block_offset) * h + y) * w + x]);
1438 *packed_w++ = kv;
1439 }
1440 packed_w += cr - cr_block_size;
1441 }
1442 }
1443 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
1444 }
1445}
1446
Marat Dukhan08b7a972020-07-14 18:17:29 -07001447void xnn_pack_qu8_dwconv_ghw_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001448 size_t h,
1449 size_t w,
1450 size_t c,
1451 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001452 const uint8_t* k,
1453 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001454 void* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001455 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001456 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001457{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001458 const int32_t izp = (int32_t) params->input_zero_point;
1459 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001460 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1461 const size_t cr_block_size = min(c - cr_block_start, cr);
1462 int32_t* packed_b = (int32_t*) packed_w;
1463 if XNN_LIKELY(b != NULL) {
1464 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1465 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1466 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1467 }
1468 } else {
1469 size_t n = cr_block_size;
1470 do {
1471 *((int32_t*) packed_w) = boff;
1472 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1473 } while (--n != 0);
1474 }
1475 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1476 for (size_t x = 0; x < w; x++) {
1477 for (size_t y = 0; y < h; y++) {
1478 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1479 const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001480 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001481 *((uint8_t*) packed_w) = kv;
1482 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1483 }
1484 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1485 }
1486 }
Marat Dukhan82286892021-06-04 17:27:27 -07001487 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -07001488 }
1489}
1490
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001491void xnn_pack_qs8_dwconv_ghw_w(
1492 size_t h,
1493 size_t w,
1494 size_t c,
1495 size_t cr,
1496 const int8_t* k,
1497 const int32_t* b,
1498 void* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001499 size_t extra_bytes,
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001500 const struct xnn_qs8_packing_params* params)
1501{
1502 const int32_t izp = (int32_t) params->input_zero_point;
1503 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1504 const size_t cr_block_size = min(c - cr_block_start, cr);
1505 int32_t* packed_b = (int32_t*) packed_w;
1506 if XNN_LIKELY(b != NULL) {
1507 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1508 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1509 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1510 }
1511 } else {
1512 size_t n = cr_block_size;
1513 do {
1514 *((int32_t*) packed_w) = 0;
1515 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1516 } while (--n != 0);
1517 }
1518 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1519 for (size_t x = 0; x < w; x++) {
1520 for (size_t y = 0; y < h; y++) {
1521 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1522 const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1523 packed_b[cr_block_offset] -= (int32_t) kv * izp;
1524 *((int8_t*) packed_w) = kv;
1525 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1526 }
1527 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1528 }
1529 }
Marat Dukhan82286892021-06-04 17:27:27 -07001530 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001531 }
1532}
1533
Marat Dukhana6879bd2020-07-06 14:25:08 -07001534void xnn_pack_f32_dwconv_hwg_w(
1535 size_t h,
1536 size_t w,
1537 size_t c,
1538 size_t cr,
1539 const float* k,
1540 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001541 float* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001542 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001543 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001544{
1545 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1546 const size_t cr_block_size = min(c - cr_block_start, cr);
1547 if XNN_LIKELY(b != NULL) {
1548 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1549 *packed_w++ = b[cr_block_start + cr_block_offset];
1550 }
1551 } else {
1552 size_t n = cr_block_size;
1553 do {
1554 *packed_w++ = 0.0f;
1555 } while (--n != 0);
1556 }
1557 packed_w += cr - cr_block_size;
1558 for (size_t x = 0; x < w; x++) {
1559 for (size_t y = 0; y < h; y++) {
1560 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1561 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1562 *packed_w++ = kv;
1563 }
1564 packed_w += cr - cr_block_size;
1565 }
1566 }
Marat Dukhan97262462021-06-18 16:14:17 -07001567 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001568 }
1569}
1570
1571void xnn_pack_f16_dwconv_hwg_w(
1572 size_t h,
1573 size_t w,
1574 size_t c,
1575 size_t cr,
1576 const uint16_t* k,
1577 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001578 uint16_t* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001579 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001580 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001581{
1582 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1583 const size_t cr_block_size = min(c - cr_block_start, cr);
1584 if XNN_LIKELY(b != NULL) {
1585 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1586 *packed_w++ = b[cr_block_start + cr_block_offset];
1587 }
1588 } else {
1589 size_t n = cr_block_size;
1590 do {
1591 *packed_w++ = 0;
1592 } while (--n != 0);
1593 }
1594 packed_w += cr - cr_block_size;
1595 for (size_t x = 0; x < w; x++) {
1596 for (size_t y = 0; y < h; y++) {
1597 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1598 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1599 *packed_w++ = kv;
1600 }
1601 packed_w += cr - cr_block_size;
1602 }
1603 }
Marat Dukhan97262462021-06-18 16:14:17 -07001604 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001605 }
1606}
1607
Marat Dukhan6989ec42022-01-14 17:14:35 -08001608void xnn_pack_f32_to_f16_dwconv_hwg_w(
1609 size_t h,
1610 size_t w,
1611 size_t c,
1612 size_t cr,
1613 const float* k,
1614 const float* b,
1615 uint16_t* packed_w,
1616 size_t extra_bytes,
1617 const void* params)
1618{
1619 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1620 const size_t cr_block_size = min(c - cr_block_start, cr);
1621 if XNN_LIKELY(b != NULL) {
1622 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1623 *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
1624 }
1625 } else {
1626 size_t n = cr_block_size;
1627 do {
1628 *packed_w++ = 0;
1629 } while (--n != 0);
1630 }
1631 packed_w += cr - cr_block_size;
1632 for (size_t x = 0; x < w; x++) {
1633 for (size_t y = 0; y < h; y++) {
1634 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1635 const uint16_t kv = fp16_ieee_from_fp32_value(k[(y * w + x) * c + (cr_block_start + cr_block_offset)]);
1636 *packed_w++ = kv;
1637 }
1638 packed_w += cr - cr_block_size;
1639 }
1640 }
1641 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
1642 }
1643}
1644
Marat Dukhan08b7a972020-07-14 18:17:29 -07001645void xnn_pack_qu8_dwconv_hwg_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001646 size_t h,
1647 size_t w,
1648 size_t c,
1649 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001650 const uint8_t* k,
1651 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001652 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001653 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001654 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001655{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001656 const int32_t izp = (int32_t) params->input_zero_point;
1657 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001658 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1659 const size_t cr_block_size = min(c - cr_block_start, cr);
1660 int32_t* packed_b = (int32_t*) packed_w;
1661 if XNN_LIKELY(b != NULL) {
1662 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1663 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1664 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1665 }
1666 } else {
1667 size_t n = cr_block_size;
1668 do {
1669 *((int32_t*) packed_w) = boff;
1670 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1671 } while (--n != 0);
1672 }
1673 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1674 for (size_t x = 0; x < w; x++) {
1675 for (size_t y = 0; y < h; y++) {
1676 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1677 const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001678 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001679 *((uint8_t*) packed_w) = kv;
1680 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1681 }
1682 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1683 }
1684 }
Marat Dukhan97262462021-06-18 16:14:17 -07001685 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -07001686 }
1687}
1688
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001689void xnn_pack_qs8_dwconv_hwg_w(
1690 size_t h,
1691 size_t w,
1692 size_t c,
1693 size_t cr,
1694 const int8_t* k,
1695 const int32_t* b,
1696 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001697 size_t extra_bytes,
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001698 const struct xnn_qs8_packing_params* params)
1699{
1700 const int32_t izp = (int32_t) params->input_zero_point;
1701 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1702 const size_t cr_block_size = min(c - cr_block_start, cr);
1703 int32_t* packed_b = (int32_t*) packed_w;
1704 if XNN_LIKELY(b != NULL) {
1705 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1706 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1707 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1708 }
1709 } else {
1710 size_t n = cr_block_size;
1711 do {
1712 *((int32_t*) packed_w) = 0;
1713 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1714 } while (--n != 0);
1715 }
1716 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1717 for (size_t x = 0; x < w; x++) {
1718 for (size_t y = 0; y < h; y++) {
1719 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1720 const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1721 packed_b[cr_block_offset] -= (int32_t) kv * izp;
1722 *((int8_t*) packed_w) = kv;
1723 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1724 }
1725 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1726 }
1727 }
Marat Dukhan97262462021-06-18 16:14:17 -07001728 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001729 }
1730}
1731
// Packs f32 weights indexed as k[n * kc + kc_idx] for each of `g` groups.
// No bias is written.
//
// Within a group, rows are taken in tiles of `nr`; for every tile the kc
// range, rounded up to a multiple of sr * kr by round_up_po2(), is walked in
// steps of `kr`. Each (step, row) pair produces `kr` output slots. The source
// column for slot j is
//   round_down_po2(step, skr) + ((step + j + row_in_tile * kr) & (skr - 1)),
// i.e. the position inside the current skr-sized window is offset by
// row_in_tile * kr, wrapping within the window. Slots whose source column is
// >= kc are left unwritten, as are the slots belonging to rows missing from a
// partial last tile.
// The loop over groups is a do/while, so the body runs at least once.
// `params` is not read.
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skr_mask = skr - 1;
  const size_t kc_padded = round_up_po2(kc, skr);
  do {
    for (size_t row_base = 0; row_base < nc; row_base += nr) {
      const size_t rows = min(nc - row_base, nr);

      for (size_t k_step = 0; k_step < kc_padded; k_step += kr) {
        const size_t window_start = round_down_po2(k_step, skr);
        for (size_t row = 0; row < rows; row++) {
          const float* src_row = k + (row_base + row) * kc;
          for (size_t j = 0; j < kr; j++) {
            const size_t kc_idx = window_start + ((k_step + j + row * kr) & skr_mask);
            if (kc_idx < kc) {
              packed_w[j] = src_row[kc_idx];
            }
          }
          packed_w += kr;
        }
        // Skip the slots of rows that do not exist in a partial tile.
        packed_w += (nr - rows) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}
1764
// Packs 16-bit weights indexed as k[n * kc + kc_idx] for each of `g` groups.
// Values are copied as raw uint16_t words. No bias is written.
//
// Within a group, rows are taken in tiles of `nr`; for every tile the kc
// range, rounded up to a multiple of sr * kr by round_up_po2(), is walked in
// steps of `kr`. Each (step, row) pair produces `kr` output slots. The source
// column for slot j is
//   round_down_po2(step, skr) + ((step + j + row_in_tile * kr) & (skr - 1)),
// i.e. the position inside the current skr-sized window is offset by
// row_in_tile * kr, wrapping within the window. Slots whose source column is
// >= kc are left unwritten, as are the slots belonging to rows missing from a
// partial last tile.
// The loop over groups is a do/while, so the body runs at least once.
// `params` is not read.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skr_mask = skr - 1;
  const size_t kc_padded = round_up_po2(kc, skr);
  do {
    for (size_t row_base = 0; row_base < nc; row_base += nr) {
      const size_t rows = min(nc - row_base, nr);

      for (size_t k_step = 0; k_step < kc_padded; k_step += kr) {
        const size_t window_start = round_down_po2(k_step, skr);
        for (size_t row = 0; row < rows; row++) {
          const uint16_t* src_row = k + (row_base + row) * kc;
          for (size_t j = 0; j < kr; j++) {
            const size_t kc_idx = window_start + ((k_step + j + row * kr) & skr_mask);
            if (kc_idx < kc) {
              packed_w[j] = src_row[kc_idx];
            }
          }
          packed_w += kr;
        }
        // Skip the slots of rows that do not exist in a partial tile.
        packed_w += (nr - rows) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}
1797
Marat Dukhana6879bd2020-07-06 14:25:08 -07001798void xnn_pack_f32_dconv_oki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001799 size_t nc,
Marat Dukhanab582382020-07-06 13:32:08 -07001800 size_t kc,
1801 size_t nr,
Marat Dukhana6879bd2020-07-06 14:25:08 -07001802 size_t kh,
1803 size_t kw,
1804 const float* k,
1805 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001806 float* packed_w,
1807 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001808{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001809 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1810 const size_t nr_block_size = min(nc - nr_block_start, nr);
1811 if XNN_LIKELY(b != NULL) {
1812 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1813 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
Marat Dukhanab582382020-07-06 13:32:08 -07001814 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001815 } else {
1816 size_t n = nr;
1817 do {
1818 *packed_w++ = 0.0f;
1819 } while (--n != 0);
1820 }
Marat Dukhanab582382020-07-06 13:32:08 -07001821
Marat Dukhana6879bd2020-07-06 14:25:08 -07001822 for (size_t kx = 0; kx < kw; kx++) {
1823 for (size_t c = 0; c < kc; c++) {
1824 for (size_t ky = 0; ky < kh; ky++) {
1825 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1826 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
Marat Dukhanab582382020-07-06 13:32:08 -07001827 }
Marat Dukhanab582382020-07-06 13:32:08 -07001828 }
1829 }
1830 }
Marat Dukhanab582382020-07-06 13:32:08 -07001831 if XNN_UNPREDICTABLE(b != NULL) {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001832 b += nr;
Marat Dukhanab582382020-07-06 13:32:08 -07001833 }
1834 }
1835}
1836
1837void xnn_pack_f16_dconv_oki_w(
1838 size_t nc,
1839 size_t kc,
1840 size_t nr,
1841 size_t kh,
1842 size_t kw,
1843 const uint16_t* k,
1844 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001845 uint16_t* packed_w,
1846 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001847{
1848 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1849 const size_t nr_block_size = min(nc - nr_block_start, nr);
1850 if XNN_LIKELY(b != NULL) {
1851 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1852 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
1853 }
1854 } else {
1855 size_t n = nr;
1856 do {
1857 *packed_w++ = 0;
1858 } while (--n != 0);
1859 }
1860
1861 for (size_t kx = 0; kx < kw; kx++) {
1862 for (size_t c = 0; c < kc; c++) {
1863 for (size_t ky = 0; ky < kh; ky++) {
1864 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1865 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
1866 }
1867 }
1868 }
1869 }
1870 if XNN_UNPREDICTABLE(b != NULL) {
1871 b += nr;
1872 }
1873 }
1874}
1875
Marat Dukhana6879bd2020-07-06 14:25:08 -07001876void xnn_pack_f32_chw_dwconv_ghw_w(
1877 size_t kernel_size,
1878 size_t groups,
1879 const float* kernel,
1880 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001881 float* packed_weights,
1882 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001883{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001884 for (size_t g = 0; g < groups; g++) {
1885 if XNN_LIKELY(bias != NULL) {
1886 *packed_weights = *bias++;
Marat Dukhanab582382020-07-06 13:32:08 -07001887 } else {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001888 *packed_weights = 0.0f;
Marat Dukhanab582382020-07-06 13:32:08 -07001889 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001890 packed_weights += 1;
1891 for (size_t i = 0; i < kernel_size; i++) {
1892 *packed_weights++ = kernel[g * kernel_size + i];
Marat Dukhanab582382020-07-06 13:32:08 -07001893 }
1894 }
1895}
1896
1897void xnn_pack_f16_chw_dwconv_ghw_w(
1898 size_t kernel_size,
1899 size_t groups,
1900 const uint16_t* kernel,
1901 const uint16_t* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001902 uint16_t* packed_weights,
1903 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001904{
1905 for (size_t g = 0; g < groups; g++) {
1906 if XNN_LIKELY(bias != NULL) {
1907 *packed_weights = *bias++;
1908 } else {
1909 *packed_weights = 0;
1910 }
1911 packed_weights += 1;
1912 for (size_t i = 0; i < kernel_size; i++) {
1913 *packed_weights++ = kernel[g * kernel_size + i];
1914 }
1915 }
1916}
1917
Marat Dukhanab582382020-07-06 13:32:08 -07001918void xnn_pack_f32_chw_dwconv_hwg_w(
1919 size_t kernel_size,
1920 size_t groups,
1921 const float* kernel,
1922 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001923 float* packed_weights,
1924 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001925{
1926 for (size_t g = 0; g < groups; g++) {
1927 if XNN_LIKELY(bias != NULL) {
1928 *packed_weights = *bias++;
1929 } else {
1930 *packed_weights = 0.0f;
1931 }
1932 packed_weights += 1;
1933 for (size_t i = 0; i < kernel_size; i++) {
1934 *packed_weights++ = kernel[i * groups + g];
1935 }
1936 }
1937}
1938
1939void xnn_pack_f32_vmulcaddc_w(
1940 size_t c,
1941 size_t cr,
1942 const float* s,
1943 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001944 float* packed_w,
1945 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001946{
1947 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1948 const size_t cr_block_size = min(c - cr_block_start, cr);
1949 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1950 *packed_w++ = s[cr_block_start + cr_block_offset];
1951 }
1952 packed_w += cr - cr_block_size;
1953 if XNN_LIKELY(b != NULL) {
1954 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1955 *packed_w++ = b[cr_block_start + cr_block_offset];
1956 }
1957 } else {
1958 size_t n = cr_block_size;
1959 do {
1960 *packed_w++ = 0.0f;
1961 } while (--n != 0);
1962 }
1963 packed_w += cr - cr_block_size;
1964 }
1965}
1966
1967void xnn_pack_f16_vmulcaddc_w(
1968 size_t c,
1969 size_t cr,
1970 const uint16_t* s,
1971 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001972 uint16_t* packed_w,
1973 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001974{
1975 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1976 const size_t cr_block_size = min(c - cr_block_start, cr);
1977 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1978 *packed_w++ = s[cr_block_start + cr_block_offset];
1979 }
1980 packed_w += cr - cr_block_size;
1981 if XNN_LIKELY(b != NULL) {
1982 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1983 *packed_w++ = b[cr_block_start + cr_block_offset];
1984 }
1985 } else {
1986 size_t n = cr_block_size;
1987 do {
1988 *packed_w++ = 0;
1989 } while (--n != 0);
1990 }
1991 packed_w += cr - cr_block_size;
1992 }
1993}
Marat Dukhan6989ec42022-01-14 17:14:35 -08001994
1995void xnn_pack_f32_to_f16_vmulcaddc_w(
1996 size_t c,
1997 size_t cr,
1998 const float* s,
1999 const float* b,
2000 uint16_t* packed_w,
2001 const void* params)
2002{
2003 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2004 const size_t cr_block_size = min(c - cr_block_start, cr);
2005 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2006 *packed_w++ = fp16_ieee_from_fp32_value(s[cr_block_start + cr_block_offset]);
2007 }
2008 packed_w += cr - cr_block_size;
2009 if XNN_LIKELY(b != NULL) {
2010 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2011 *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
2012 }
2013 } else {
2014 size_t n = cr_block_size;
2015 do {
2016 *packed_w++ = 0;
2017 } while (--n != 0);
2018 }
2019 packed_w += cr - cr_block_size;
2020 }
2021}
Marat Dukhanaf1671a2022-02-04 00:32:09 -08002022
// Copies `c` floats from `s` to `packed_w` unchanged.
// The copy uses memcpy, so the two buffers must not overlap.
void xnn_pack_f32_prelu_w(
  size_t c,
  const float* s,
  float* packed_w)
{
  const size_t byte_count = c * sizeof(float);
  memcpy(packed_w, s, byte_count);
}
2030
// Copies `c` 16-bit words from `s` to `packed_w` unchanged.
// The copy uses memcpy, so the two buffers must not overlap.
void xnn_pack_f16_prelu_w(
  size_t c,
  const uint16_t* s,
  uint16_t* packed_w)
{
  const size_t byte_count = c * sizeof(uint16_t);
  memcpy(packed_w, s, byte_count);
}
2038
// Converts `c` floats from `s` with fp16_ieee_from_fp32_value() and stores
// the 16-bit results in `packed_w`.
//
// A counted loop is used so that c == 0 writes nothing, matching the
// memcpy-based prelu packers. A do/while that pre-decrements `c` would wrap
// around for c == 0 and run past the end of both buffers.
void xnn_pack_f32_to_f16_prelu_w(
  size_t c,
  const float* s,
  uint16_t* packed_w)
{
  for (size_t i = 0; i < c; i++) {
    packed_w[i] = fp16_ieee_from_fp32_value(s[i]);
  }
}