blob: c5eb5136c2a241ec5a533c881b99ba4e5547346e [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8
9#include <stdint.h>
10#include <stddef.h>
11
12#include <xnnpack/math.h>
13#include <xnnpack/pack.h>
14
15
Marat Dukhana6879bd2020-07-06 14:25:08 -070016void xnn_pack_f32_gemm_goi_w(
17 size_t g,
18 size_t nc,
19 size_t kc,
20 size_t nr,
21 size_t kr,
22 size_t sr,
23 const float* k,
24 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070025 float* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -070026 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -070027 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070028{
29 const size_t skr = sr * kr;
30 const size_t skc = round_down_po2(kc, skr);
31 const size_t sr_mask = (sr - 1) * kr;
32 do {
33 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
34 const size_t nr_block_size = min(nc - nr_block_start, nr);
35 if XNN_LIKELY(b != NULL) {
36 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
37 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
38 }
39 }
40 packed_w += nr;
41
42 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
43 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
44 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
45 *packed_w++ =
46 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
47 }
48 }
49 packed_w += (nr - nr_block_size) * kr;
50 }
51
52 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
53 const size_t kr_block_size = min(kc - kr_block_start, kr);
54 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
55 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
56 *packed_w++ =
57 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
58 }
59 packed_w += kr - kr_block_size;
60 }
61 packed_w += (nr - nr_block_size) * kr;
62 }
Marat Dukhane06c8132021-06-03 08:59:11 -070063 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -070064 }
65 k += nc * kc;
66 if XNN_UNPREDICTABLE(b != NULL) {
67 b += nc;
68 }
69 } while (--g != 0);
70}
71
72void xnn_pack_f16_gemm_goi_w(
73 size_t g,
74 size_t nc,
75 size_t kc,
76 size_t nr,
77 size_t kr,
78 size_t sr,
79 const uint16_t* k,
80 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070081 uint16_t* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -070082 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -070083 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070084{
85 const size_t skr = sr * kr;
86 const size_t skc = round_down_po2(kc, skr);
87 const size_t sr_mask = (sr - 1) * kr;
88 do {
89 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
90 const size_t nr_block_size = min(nc - nr_block_start, nr);
91 if XNN_LIKELY(b != NULL) {
92 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
93 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
94 }
95 }
96 packed_w += nr;
97
98 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
99 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
100 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
101 *packed_w++ =
102 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
103 }
104 }
105 packed_w += (nr - nr_block_size) * kr;
106 }
107
108 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
109 const size_t kr_block_size = min(kc - kr_block_start, kr);
110 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
111 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
112 *packed_w++ =
113 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
114 }
115 packed_w += kr - kr_block_size;
116 }
117 packed_w += (nr - nr_block_size) * kr;
118 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700119 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700120 }
121 k += nc * kc;
122 if XNN_UNPREDICTABLE(b != NULL) {
123 b += nc;
124 }
125 } while (--g != 0);
126}
127
Marat Dukhan08b7a972020-07-14 18:17:29 -0700128void xnn_pack_qu8_gemm_goi_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700129 size_t g,
130 size_t nc,
131 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700132 size_t nr,
133 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700134 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700135 const uint8_t* k,
136 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700137 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700138 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700139 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700140{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700141 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700142 const int32_t izp = (int32_t) params->input_zero_point;
143 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700144 do {
145 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
146 const size_t nr_block_size = min(nc - nr_block_start, nr);
147 int32_t* packed_b = (int32_t*) packed_w;
148 if XNN_LIKELY(b != NULL) {
149 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
150 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
151 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
152 }
153 } else {
154 size_t n = nr_block_size;
155 do {
156 *((int32_t*) packed_w) = boff;
157 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
158 } while (--n != 0);
159 }
160 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
161 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
162 const size_t kr_block_size = min(kc - kr_block_start, kr);
163 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
164 int32_t ksum = 0;
165 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
166 const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
167 ksum += (int32_t) kv;
168 *((uint8_t*) packed_w) = kv;
169 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
170 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700171 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700172 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
173 }
174 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
175 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700176 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -0700177 }
178 k += nc * kc;
179 if XNN_UNPREDICTABLE(b != NULL) {
180 b += nc;
181 }
182 } while (--g != 0);
183}
184
Marat Dukhan595e1702020-07-31 10:12:52 -0700185void xnn_pack_qs8_gemm_goi_w(
186 size_t g,
187 size_t nc,
188 size_t kc,
189 size_t nr,
190 size_t kr,
191 size_t sr,
192 const int8_t* k,
193 const int32_t* b,
194 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700195 size_t extra_bytes,
Marat Dukhan595e1702020-07-31 10:12:52 -0700196 const struct xnn_qs8_packing_params* params)
197{
198 assert(sr == 1);
199 const int32_t izp = (int32_t) params->input_zero_point;
200 do {
201 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
202 const size_t nr_block_size = min(nc - nr_block_start, nr);
203 int32_t* packed_b = (int32_t*) packed_w;
204 if XNN_LIKELY(b != NULL) {
205 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
206 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
207 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
208 }
209 } else {
210 size_t n = nr_block_size;
211 do {
212 *((int32_t*) packed_w) = 0;
213 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
214 } while (--n != 0);
215 }
216 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
217 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
218 const size_t kr_block_size = min(kc - kr_block_start, kr);
219 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
220 int32_t ksum = 0;
221 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
222 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
223 ksum += (int32_t) kv;
224 *((int8_t*) packed_w) = kv;
225 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
226 }
227 packed_b[nr_block_offset] -= ksum * izp;
228 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
229 }
230 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
231 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700232 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan595e1702020-07-31 10:12:52 -0700233 }
234 k += nc * kc;
235 if XNN_UNPREDICTABLE(b != NULL) {
236 b += nc;
237 }
238 } while (--g != 0);
239}
240
Marat Dukhan683fab32020-08-03 19:42:52 -0700241void xnn_pack_qs8_gemm_xw_goi_w(
242 size_t g,
243 size_t nc,
244 size_t kc,
245 size_t nr,
246 size_t kr,
247 size_t sr,
248 const int8_t* k,
249 const int32_t* b,
250 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700251 size_t extra_bytes,
Marat Dukhan683fab32020-08-03 19:42:52 -0700252 const struct xnn_qs8_packing_params* params)
253{
254 assert(sr == 1);
255 const int32_t izp = (int32_t) params->input_zero_point;
256 do {
257 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
258 const size_t nr_block_size = min(nc - nr_block_start, nr);
259 int32_t* packed_b = (int32_t*) packed_w;
260 if XNN_LIKELY(b != NULL) {
261 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
262 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
263 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
264 }
265 } else {
266 size_t n = nr_block_size;
267 do {
268 *((int32_t*) packed_w) = 0;
269 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
270 } while (--n != 0);
271 }
272 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
273 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
274 const size_t kr_block_size = min(kc - kr_block_start, kr);
275 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
276 int32_t ksum = 0;
277 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
278 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
279 ksum += (int32_t) kv;
280 *((int16_t*) packed_w) = (int16_t) kv;
281 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int16_t));
282 }
283 packed_b[nr_block_offset] -= ksum * izp;
284 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int16_t));
285 }
286 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int16_t));
287 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700288 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan683fab32020-08-03 19:42:52 -0700289 }
290 k += nc * kc;
291 if XNN_UNPREDICTABLE(b != NULL) {
292 b += nc;
293 }
294 } while (--g != 0);
295}
296
Marat Dukhana6879bd2020-07-06 14:25:08 -0700297void xnn_pack_f32_gemm_io_w(
298 size_t nc,
299 size_t kc,
300 size_t nr,
301 size_t kr,
302 size_t sr,
303 const float* k,
304 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700305 float* packed_w,
306 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700307{
308 const size_t skr = sr * kr;
309 const size_t skc = round_down_po2(kc, skr);
310 const size_t sr_mask = (sr - 1) * kr;
311 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
312 const size_t nr_block_size = min(nc - nr_block_start, nr);
313 if XNN_LIKELY(b != NULL) {
314 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
315 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
316 }
317 }
318 packed_w += nr;
319
320 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
321 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
322 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
323 *packed_w++ =
324 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
325 }
326 }
327 packed_w += (nr - nr_block_size) * kr;
328 }
329
330 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
331 const size_t kr_block_size = min(kc - kr_block_start, kr);
332 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
333 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
334 *packed_w++ =
335 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
336 }
337 packed_w += kr - kr_block_size;
338 }
339 packed_w += (nr - nr_block_size) * kr;
340 }
341 }
342}
343
344void xnn_pack_f16_gemm_io_w(
345 size_t nc,
346 size_t kc,
347 size_t nr,
348 size_t kr,
349 size_t sr,
350 const uint16_t* k,
351 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700352 uint16_t* packed_w,
353 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700354{
355 const size_t skr = sr * kr;
356 const size_t skc = round_down_po2(kc, skr);
357 const size_t sr_mask = (sr - 1) * kr;
358 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
359 const size_t nr_block_size = min(nc - nr_block_start, nr);
360 if XNN_LIKELY(b != NULL) {
361 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
362 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
363 }
364 }
365 packed_w += nr;
366
367 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
368 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
369 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
370 *packed_w++ =
371 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
372 }
373 }
374 packed_w += (nr - nr_block_size) * kr;
375 }
376
377 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
378 const size_t kr_block_size = min(kc - kr_block_start, kr);
379 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
380 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
381 *packed_w++ =
382 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
383 }
384 packed_w += kr - kr_block_size;
385 }
386 packed_w += (nr - nr_block_size) * kr;
387 }
388 }
389}
390
Marat Dukhan08b7a972020-07-14 18:17:29 -0700391void xnn_pack_qu8_gemm_io_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700392 size_t nc,
393 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700394 size_t nr,
395 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700396 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700397 const uint8_t* k,
398 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700399 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700400 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700401{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700402 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700403 const int32_t izp = (int32_t) params->input_zero_point;
404 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700405 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
406 const size_t nr_block_size = min(nc - nr_block_start, nr);
407 int32_t* packed_b = (int32_t*) packed_w;
408 if XNN_LIKELY(b != NULL) {
409 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
410 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
411 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
412 }
413 } else {
414 size_t n = nr_block_size;
415 do {
416 *((int32_t*) packed_w) = boff;
417 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
418 } while (--n != 0);
419 }
420 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
421 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
422 const size_t kr_block_size = min(kc - kr_block_start, kr);
423 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
424 int32_t ksum = 0;
425 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
426 const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
427 ksum += (int32_t) kv;
428 *((uint8_t*) packed_w) = kv;
429 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
430 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700431 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700432 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
433 }
434 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
435 }
436 }
437}
438
Marat Dukhand23cb6e2021-04-01 01:18:58 -0700439void xnn_pack_qs8_gemm_io_w(
440 size_t nc,
441 size_t kc,
442 size_t nr,
443 size_t kr,
444 size_t sr,
445 const int8_t* k,
446 const int32_t* b,
447 void* packed_w,
448 const struct xnn_qs8_packing_params* params)
449{
450 assert(sr == 1);
451 const int32_t izp = (int32_t) params->input_zero_point;
452 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
453 const size_t nr_block_size = min(nc - nr_block_start, nr);
454 int32_t* packed_b = (int32_t*) packed_w;
455 if XNN_LIKELY(b != NULL) {
456 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
457 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
458 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
459 }
460 } else {
461 size_t n = nr_block_size;
462 do {
463 *((int32_t*) packed_w) = 0;
464 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
465 } while (--n != 0);
466 }
467 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
468 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
469 const size_t kr_block_size = min(kc - kr_block_start, kr);
470 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
471 int32_t ksum = 0;
472 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
473 const int8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
474 ksum += (int32_t) kv;
475 *((int8_t*) packed_w) = kv;
476 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
477 }
478 packed_b[nr_block_offset] -= ksum * izp;
479 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
480 }
481 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
482 }
483 }
484}
485
Marat Dukhana6879bd2020-07-06 14:25:08 -0700486void xnn_pack_f32_conv_goki_w(
487 size_t g,
488 size_t nc,
489 size_t ks,
490 size_t kc,
491 size_t nr,
492 size_t kr,
493 size_t sr,
494 const float* k,
495 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700496 float* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700497 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700498 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700499{
500 const size_t skr = sr * kr;
501 const size_t skc = round_down_po2(kc, skr);
502 const size_t sr_mask = (sr - 1) * kr;
503 do {
504 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
505 const size_t nr_block_size = min(nc - nr_block_start, nr);
506 if XNN_LIKELY(b != NULL) {
507 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
508 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
509 }
510 }
511 packed_w += nr;
512
513 for (size_t ki = 0; ki < ks; ki++) {
514 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
515 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
516 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
517 *packed_w++ =
518 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
519 }
520 }
521 packed_w += (nr - nr_block_size) * kr;
522 }
523
524 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
525 const size_t kr_block_size = min(kc - kr_block_start, kr);
526 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
527 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
528 *packed_w++ =
529 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
530 }
531 packed_w += kr - kr_block_size;
532 }
533 packed_w += (nr - nr_block_size) * kr;
534 }
535 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700536 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700537 }
538 k += ks * kc * nc;
539 if XNN_UNPREDICTABLE(b != NULL) {
540 b += nc;
541 }
542 } while (--g != 0);
543}
544
545void xnn_pack_f16_conv_goki_w(
546 size_t g,
547 size_t nc,
548 size_t ks,
549 size_t kc,
550 size_t nr,
551 size_t kr,
552 size_t sr,
553 const uint16_t* k,
554 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700555 uint16_t* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700556 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700557 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700558{
559 const size_t skr = sr * kr;
560 const size_t skc = round_down_po2(kc, skr);
561 const size_t sr_mask = (sr - 1) * kr;
562 do {
563 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
564 const size_t nr_block_size = min(nc - nr_block_start, nr);
565 if XNN_LIKELY(b != NULL) {
566 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
567 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
568 }
569 }
570 packed_w += nr;
571
572 for (size_t ki = 0; ki < ks; ki++) {
573 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
574 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
575 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
576 *packed_w++ =
577 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
578 }
579 }
580 packed_w += (nr - nr_block_size) * kr;
581 }
582
583 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
584 const size_t kr_block_size = min(kc - kr_block_start, kr);
585 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
586 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
587 *packed_w++ =
588 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
589 }
590 packed_w += kr - kr_block_size;
591 }
592 packed_w += (nr - nr_block_size) * kr;
593 }
594 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700595 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700596 }
597 k += ks * kc * nc;
598 if XNN_UNPREDICTABLE(b != NULL) {
599 b += nc;
600 }
601 } while (--g != 0);
602}
603
Marat Dukhan08b7a972020-07-14 18:17:29 -0700604void xnn_pack_qu8_conv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700605 size_t g,
606 size_t nc,
607 size_t ks,
608 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700609 size_t nr,
610 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700611 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700612 const uint8_t* k,
613 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700614 void* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700615 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700616 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700617{
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700618 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700619 const int32_t izp = (int32_t) params->input_zero_point;
620 const int32_t boff = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700621 do {
622 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
623 const size_t nr_block_size = min(nc - nr_block_start, nr);
624 int32_t* packed_b = (int32_t*) packed_w;
625 if XNN_LIKELY(b != NULL) {
626 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
627 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
628 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
629 }
630 } else {
631 size_t n = nr_block_size;
632 do {
633 *((int32_t*) packed_w) = boff;
634 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
635 } while (--n != 0);
636 }
637 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
638 for (size_t ki = 0; ki < ks; ki++) {
639 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
640 const size_t kr_block_size = min(kc - kr_block_start, kr);
641 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
642 int32_t ksum = 0;
643 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
644 const uint8_t kv =
645 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
646 ksum += (int32_t) kv;
647 *((uint8_t*) packed_w) = kv;
648 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
649 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700650 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700651 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
652 }
653 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
654 }
655 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700656 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -0700657 }
658 k += ks * kc * nc;
659 if XNN_UNPREDICTABLE(b != NULL) {
660 b += nc;
661 }
662 } while (--g != 0);
663}
664
Marat Dukhanf9480682020-07-31 14:50:24 -0700665void xnn_pack_qs8_conv_goki_w(
666 size_t g,
667 size_t nc,
668 size_t ks,
669 size_t kc,
670 size_t nr,
671 size_t kr,
672 size_t sr,
673 const int8_t* k,
674 const int32_t* b,
675 void* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700676 size_t extra_bytes,
Marat Dukhanf9480682020-07-31 14:50:24 -0700677 const struct xnn_qs8_packing_params* params)
678{
679 assert(sr == 1);
680 const int32_t izp = (int32_t) params->input_zero_point;
681 do {
682 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
683 const size_t nr_block_size = min(nc - nr_block_start, nr);
684 int32_t* packed_b = (int32_t*) packed_w;
685 if XNN_LIKELY(b != NULL) {
686 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
687 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
688 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
689 }
690 } else {
691 size_t n = nr_block_size;
692 do {
693 *((int32_t*) packed_w) = 0;
694 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
695 } while (--n != 0);
696 }
697 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
698 for (size_t ki = 0; ki < ks; ki++) {
699 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
700 const size_t kr_block_size = min(kc - kr_block_start, kr);
701 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
702 int32_t ksum = 0;
703 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
704 const int8_t kv =
705 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
706 ksum += (int32_t) kv;
707 *((int8_t*) packed_w) = kv;
708 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
709 }
710 packed_b[nr_block_offset] -= ksum * izp;
711 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
712 }
713 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
714 }
715 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700716 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanf9480682020-07-31 14:50:24 -0700717 }
718 k += ks * kc * nc;
719 if XNN_UNPREDICTABLE(b != NULL) {
720 b += nc;
721 }
722 } while (--g != 0);
723}
724
Marat Dukhana6879bd2020-07-06 14:25:08 -0700725void xnn_pack_f32_conv_kgo_w(
726 size_t g,
727 size_t nc,
728 size_t ks,
729 size_t nr,
730 size_t kr,
731 const float* k,
732 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700733 float* packed_w,
734 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700735{
736 for (size_t i = 0; i < g; i++) {
737 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
738 const size_t nr_block_size = min(nc - nr_block_start, nr);
739 if XNN_LIKELY(b != NULL) {
740 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
741 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
742 }
743 }
744 packed_w += nr;
745 for (size_t ki = 0; ki < ks; ki++) {
746 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
747 *packed_w =
748 k[ki * g * nc + (nr_block_start + nr_block_offset)];
749 packed_w += kr;
750 }
751 packed_w += (nr - nr_block_size) * kr;
752 }
753 }
754 k += nc;
755 if XNN_UNPREDICTABLE(b != NULL) {
756 b += nc;
757 }
758 }
759}
760
761void xnn_pack_f16_conv_kgo_w(
762 size_t g,
763 size_t nc,
764 size_t ks,
765 size_t nr,
766 size_t kr,
767 const uint16_t* k,
768 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700769 uint16_t* packed_w,
770 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700771{
772 for (size_t i = 0; i < g; i++) {
773 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
774 const size_t nr_block_size = min(nc - nr_block_start, nr);
775 if XNN_LIKELY(b != NULL) {
776 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
777 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
778 }
779 }
780 packed_w += nr;
781 for (size_t ki = 0; ki < ks; ki++) {
782 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
783 *packed_w =
784 k[ki * g * nc + (nr_block_start + nr_block_offset)];
785 packed_w += kr;
786 }
787 packed_w += (nr - nr_block_size) * kr;
788 }
789 }
790 k += nc;
791 if XNN_UNPREDICTABLE(b != NULL) {
792 b += nc;
793 }
794 }
795}
796
Marat Dukhan08b7a972020-07-14 18:17:29 -0700797void xnn_pack_qu8_conv_kgo_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700798 size_t g,
799 size_t nc,
800 size_t ks,
Marat Dukhana6879bd2020-07-06 14:25:08 -0700801 size_t nr,
802 size_t kr,
Marat Dukhanab582382020-07-06 13:32:08 -0700803 const uint8_t* k,
804 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700805 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700806 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700807{
Marat Dukhanb42f8662020-07-06 20:46:13 -0700808 const int32_t izp = (int32_t) params->input_zero_point;
809 const int32_t boff = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700810 for (size_t i = 0; i < g; i++) {
811 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
812 const size_t nr_block_size = min(nc - nr_block_start, nr);
813 int32_t* packed_b = (int32_t*) packed_w;
814 if XNN_LIKELY(b != NULL) {
815 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
816 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
817 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
818 }
819 } else {
820 size_t n = nr_block_size;
821 do {
822 *((int32_t*) packed_w) = boff;
823 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
824 } while (--n != 0);
825 }
826 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
827 for (size_t ki = 0; ki < ks; ki++) {
828 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
829 const uint8_t kv =
830 k[ki * g * nc + (nr_block_start + nr_block_offset)];
831 *((uint8_t*) packed_w) = kv;
Marat Dukhanb42f8662020-07-06 20:46:13 -0700832 packed_b[nr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700833 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
834 }
835 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
836 }
837 }
838 k += nc;
839 if XNN_UNPREDICTABLE(b != NULL) {
840 b += nc;
841 }
842 }
843}
844
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700845void xnn_pack_qs8_conv_kgo_w(
846 size_t g,
847 size_t nc,
848 size_t ks,
849 size_t nr,
850 size_t kr,
851 const int8_t* k,
852 const int32_t* b,
853 void* packed_w,
854 const struct xnn_qs8_packing_params* params)
855{
856 const int32_t izp = (int32_t) params->input_zero_point;
857 for (size_t i = 0; i < g; i++) {
858 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
859 const size_t nr_block_size = min(nc - nr_block_start, nr);
860 int32_t* packed_b = (int32_t*) packed_w;
861 if XNN_LIKELY(b != NULL) {
862 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
863 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
864 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
865 }
866 } else {
867 size_t n = nr_block_size;
868 do {
869 *((int32_t*) packed_w) = 0;
870 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
871 } while (--n != 0);
872 }
873 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
874 for (size_t ki = 0; ki < ks; ki++) {
875 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
876 const int8_t kv =
877 k[ki * g * nc + (nr_block_start + nr_block_offset)];
878 *((int8_t*) packed_w) = kv;
879 packed_b[nr_block_offset] -= (int32_t) kv * izp;
880 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(int8_t));
881 }
882 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
883 }
884 }
885 k += nc;
886 if XNN_UNPREDICTABLE(b != NULL) {
887 b += nc;
888 }
889 }
890}
891
Marat Dukhana6879bd2020-07-06 14:25:08 -0700892void xnn_pack_f32_deconv_goki_w(
893 size_t g,
894 size_t nc,
895 size_t kh,
896 size_t kw,
897 size_t kc,
898 size_t sh,
899 size_t sw,
900 size_t nr,
901 size_t kr,
902 size_t sr,
903 const float* k,
904 const float* b,
905 float* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700906 struct subconvolution_params* subconv_params,
907 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700908{
909 const size_t skr = sr * kr;
910 const size_t skc = round_down_po2(kc, skr);
911 const size_t sr_mask = (sr - 1) * kr;
912 for (size_t i = 0; i < g; i++) {
913 for (size_t oy = 0; oy < sh; oy++) {
914 for (size_t ox = 0; ox < sw; ox++) {
915 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700916 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700917 }
918 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
919 const size_t nr_block_size = min(nc - nr_block_start, nr);
920 if XNN_LIKELY(b != NULL) {
921 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
922 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
923 }
924 }
925 packed_w += nr;
926 for (size_t ky = oy; ky < kh; ky += sh) {
927 for (size_t kx = ox; kx < kw; kx += sw) {
928 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
929 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
930 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
931 *packed_w++ =
932 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
933 }
934 }
935 packed_w += (nr - nr_block_size) * kr;
936 }
937
938 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
939 const size_t kr_block_size = min(kc - kr_block_start, kr);
940 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
941 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
942 *packed_w++ =
943 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
944 }
945 packed_w += kr - kr_block_size;
946 }
947 packed_w += (nr - nr_block_size) * kr;
948 }
949 }
950 }
951 }
952 }
953 }
954 k += kh * kw * kc * nc;
955 if XNN_UNPREDICTABLE(b != NULL) {
956 b += nc;
957 }
958 }
959}
960
961void xnn_pack_f16_deconv_goki_w(
962 size_t g,
963 size_t nc,
964 size_t kh,
965 size_t kw,
966 size_t kc,
967 size_t sh,
968 size_t sw,
969 size_t nr,
970 size_t kr,
971 size_t sr,
972 const uint16_t* k,
973 const uint16_t* b,
974 uint16_t* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700975 struct subconvolution_params* subconv_params,
976 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700977{
978 const size_t skr = sr * kr;
979 const size_t skc = round_down_po2(kc, skr);
980 const size_t sr_mask = (sr - 1) * kr;
981 for (size_t i = 0; i < g; i++) {
982 for (size_t oy = 0; oy < sh; oy++) {
983 for (size_t ox = 0; ox < sw; ox++) {
984 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700985 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700986 }
987 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
988 const size_t nr_block_size = min(nc - nr_block_start, nr);
989 if XNN_LIKELY(b != NULL) {
990 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
991 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
992 }
993 }
994 packed_w += nr;
995 for (size_t ky = oy; ky < kh; ky += sh) {
996 for (size_t kx = ox; kx < kw; kx += sw) {
997 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
998 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
999 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1000 *packed_w++ =
1001 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
1002 }
1003 }
1004 packed_w += (nr - nr_block_size) * kr;
1005 }
1006
1007 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
1008 const size_t kr_block_size = min(kc - kr_block_start, kr);
1009 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1010 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1011 *packed_w++ =
1012 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
1013 }
1014 packed_w += kr - kr_block_size;
1015 }
1016 packed_w += (nr - nr_block_size) * kr;
1017 }
1018 }
1019 }
1020 }
1021 }
1022 }
1023 k += kh * kw * kc * nc;
1024 if XNN_UNPREDICTABLE(b != NULL) {
1025 b += nc;
1026 }
1027 }
1028}
1029
Marat Dukhan08b7a972020-07-14 18:17:29 -07001030void xnn_pack_qu8_deconv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001031 size_t g,
1032 size_t nc,
1033 size_t kh,
1034 size_t kw,
1035 size_t kc,
1036 size_t sh,
1037 size_t sw,
1038 size_t nr,
1039 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -07001040 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -07001041 const uint8_t* k,
1042 const int32_t* b,
1043 void* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001044 struct subconvolution_params* subconv_params,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001045 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001046{
Marat Dukhan5a698bb2020-07-07 20:47:55 -07001047 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -07001048 const int32_t izp = (int32_t) params->input_zero_point;
1049 const int32_t kzp = (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001050 for (size_t i = 0; i < g; i++) {
1051 for (size_t oy = 0; oy < sh; oy++) {
1052 for (size_t ox = 0; ox < sw; ox++) {
1053 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -07001054 (*subconv_params++).weights = packed_w;
Marat Dukhanab582382020-07-06 13:32:08 -07001055 }
Marat Dukhanb42f8662020-07-06 20:46:13 -07001056 const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
Marat Dukhanab582382020-07-06 13:32:08 -07001057 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1058 const size_t nr_block_size = min(nc - nr_block_start, nr);
1059 int32_t* packed_b = (int32_t*) packed_w;
1060 if XNN_LIKELY(b != 0) {
1061 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1062 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
1063 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1064 }
1065 } else {
1066 size_t n = nr_block_size;
1067 do {
1068 *((int32_t*) packed_w) = boff;
1069 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1070 } while (--n != 0);
1071 }
1072 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1073 for (size_t ky = oy; ky < kh; ky += sh) {
1074 for (size_t kx = ox; kx < kw; kx += sw) {
1075 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
1076 const size_t kr_block_size = min(kc - kr_block_start, kr);
1077 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1078 int32_t ksum = 0;
1079 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1080 const uint8_t kv =
1081 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
1082 ksum += (int32_t) kv;
1083 *((uint8_t*) packed_w) = kv;
1084 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1085 }
Marat Dukhanb42f8662020-07-06 20:46:13 -07001086 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001087 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
1088 }
1089 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
1090 }
1091 }
1092 }
1093 }
1094 }
1095 }
1096 k += kh * kw * kc * nc;
1097 if XNN_UNPREDICTABLE(b != NULL) {
1098 b += nc;
1099 }
1100 }
1101}
1102
Marat Dukhana6879bd2020-07-06 14:25:08 -07001103void xnn_pack_f32_dwconv_ghw_w(
1104 size_t h,
1105 size_t w,
1106 size_t c,
1107 size_t cr,
1108 const float* k,
1109 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001110 float* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001111 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001112 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001113{
1114 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1115 const size_t cr_block_size = min(c - cr_block_start, cr);
1116 if XNN_LIKELY(b != NULL) {
1117 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1118 *packed_w++ = b[cr_block_start + cr_block_offset];
1119 }
1120 } else {
1121 size_t n = cr_block_size;
1122 do {
1123 *packed_w++ = 0.0f;
1124 } while (--n != 0);
1125 }
1126 packed_w += cr - cr_block_size;
1127 for (size_t x = 0; x < w; x++) {
1128 for (size_t y = 0; y < h; y++) {
1129 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1130 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1131 *packed_w++ = kv;
1132 }
1133 packed_w += cr - cr_block_size;
1134 }
1135 }
Marat Dukhan82286892021-06-04 17:27:27 -07001136 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001137 }
1138}
1139
1140void xnn_pack_f16_dwconv_ghw_w(
1141 size_t h,
1142 size_t w,
1143 size_t c,
1144 size_t cr,
1145 const uint16_t* k,
1146 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001147 uint16_t* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001148 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001149 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001150{
1151 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1152 const size_t cr_block_size = min(c - cr_block_start, cr);
1153 if XNN_LIKELY(b != NULL) {
1154 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1155 *packed_w++ = b[cr_block_start + cr_block_offset];
1156 }
1157 } else {
1158 size_t n = cr_block_size;
1159 do {
1160 *packed_w++ = 0;
1161 } while (--n != 0);
1162 }
1163 packed_w += cr - cr_block_size;
1164 for (size_t x = 0; x < w; x++) {
1165 for (size_t y = 0; y < h; y++) {
1166 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1167 const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1168 *packed_w++ = kv;
1169 }
1170 packed_w += cr - cr_block_size;
1171 }
1172 }
Marat Dukhan82286892021-06-04 17:27:27 -07001173 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001174 }
1175}
1176
Marat Dukhan08b7a972020-07-14 18:17:29 -07001177void xnn_pack_qu8_dwconv_ghw_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001178 size_t h,
1179 size_t w,
1180 size_t c,
1181 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001182 const uint8_t* k,
1183 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001184 void* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001185 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001186 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001187{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001188 const int32_t izp = (int32_t) params->input_zero_point;
1189 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001190 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1191 const size_t cr_block_size = min(c - cr_block_start, cr);
1192 int32_t* packed_b = (int32_t*) packed_w;
1193 if XNN_LIKELY(b != NULL) {
1194 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1195 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1196 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1197 }
1198 } else {
1199 size_t n = cr_block_size;
1200 do {
1201 *((int32_t*) packed_w) = boff;
1202 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1203 } while (--n != 0);
1204 }
1205 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1206 for (size_t x = 0; x < w; x++) {
1207 for (size_t y = 0; y < h; y++) {
1208 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1209 const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001210 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001211 *((uint8_t*) packed_w) = kv;
1212 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1213 }
1214 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1215 }
1216 }
Marat Dukhan82286892021-06-04 17:27:27 -07001217 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -07001218 }
1219}
1220
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001221void xnn_pack_qs8_dwconv_ghw_w(
1222 size_t h,
1223 size_t w,
1224 size_t c,
1225 size_t cr,
1226 const int8_t* k,
1227 const int32_t* b,
1228 void* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001229 size_t extra_bytes,
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001230 const struct xnn_qs8_packing_params* params)
1231{
1232 const int32_t izp = (int32_t) params->input_zero_point;
1233 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1234 const size_t cr_block_size = min(c - cr_block_start, cr);
1235 int32_t* packed_b = (int32_t*) packed_w;
1236 if XNN_LIKELY(b != NULL) {
1237 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1238 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1239 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1240 }
1241 } else {
1242 size_t n = cr_block_size;
1243 do {
1244 *((int32_t*) packed_w) = 0;
1245 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1246 } while (--n != 0);
1247 }
1248 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1249 for (size_t x = 0; x < w; x++) {
1250 for (size_t y = 0; y < h; y++) {
1251 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1252 const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1253 packed_b[cr_block_offset] -= (int32_t) kv * izp;
1254 *((int8_t*) packed_w) = kv;
1255 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1256 }
1257 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1258 }
1259 }
Marat Dukhan82286892021-06-04 17:27:27 -07001260 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001261 }
1262}
1263
Marat Dukhana6879bd2020-07-06 14:25:08 -07001264void xnn_pack_f32_dwconv_hwg_w(
1265 size_t h,
1266 size_t w,
1267 size_t c,
1268 size_t cr,
1269 const float* k,
1270 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001271 float* packed_w,
1272 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001273{
1274 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1275 const size_t cr_block_size = min(c - cr_block_start, cr);
1276 if XNN_LIKELY(b != NULL) {
1277 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1278 *packed_w++ = b[cr_block_start + cr_block_offset];
1279 }
1280 } else {
1281 size_t n = cr_block_size;
1282 do {
1283 *packed_w++ = 0.0f;
1284 } while (--n != 0);
1285 }
1286 packed_w += cr - cr_block_size;
1287 for (size_t x = 0; x < w; x++) {
1288 for (size_t y = 0; y < h; y++) {
1289 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1290 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1291 *packed_w++ = kv;
1292 }
1293 packed_w += cr - cr_block_size;
1294 }
1295 }
1296 }
1297}
1298
1299void xnn_pack_f16_dwconv_hwg_w(
1300 size_t h,
1301 size_t w,
1302 size_t c,
1303 size_t cr,
1304 const uint16_t* k,
1305 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001306 uint16_t* packed_w,
1307 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001308{
1309 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1310 const size_t cr_block_size = min(c - cr_block_start, cr);
1311 if XNN_LIKELY(b != NULL) {
1312 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1313 *packed_w++ = b[cr_block_start + cr_block_offset];
1314 }
1315 } else {
1316 size_t n = cr_block_size;
1317 do {
1318 *packed_w++ = 0;
1319 } while (--n != 0);
1320 }
1321 packed_w += cr - cr_block_size;
1322 for (size_t x = 0; x < w; x++) {
1323 for (size_t y = 0; y < h; y++) {
1324 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1325 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1326 *packed_w++ = kv;
1327 }
1328 packed_w += cr - cr_block_size;
1329 }
1330 }
1331 }
1332}
1333
Marat Dukhan08b7a972020-07-14 18:17:29 -07001334void xnn_pack_qu8_dwconv_hwg_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001335 size_t h,
1336 size_t w,
1337 size_t c,
1338 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001339 const uint8_t* k,
1340 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001341 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001342 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001343{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001344 const int32_t izp = (int32_t) params->input_zero_point;
1345 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001346 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1347 const size_t cr_block_size = min(c - cr_block_start, cr);
1348 int32_t* packed_b = (int32_t*) packed_w;
1349 if XNN_LIKELY(b != NULL) {
1350 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1351 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1352 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1353 }
1354 } else {
1355 size_t n = cr_block_size;
1356 do {
1357 *((int32_t*) packed_w) = boff;
1358 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1359 } while (--n != 0);
1360 }
1361 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1362 for (size_t x = 0; x < w; x++) {
1363 for (size_t y = 0; y < h; y++) {
1364 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1365 const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001366 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001367 *((uint8_t*) packed_w) = kv;
1368 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1369 }
1370 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1371 }
1372 }
1373 }
1374}
1375
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001376void xnn_pack_qs8_dwconv_hwg_w(
1377 size_t h,
1378 size_t w,
1379 size_t c,
1380 size_t cr,
1381 const int8_t* k,
1382 const int32_t* b,
1383 void* packed_w,
1384 const struct xnn_qs8_packing_params* params)
1385{
1386 const int32_t izp = (int32_t) params->input_zero_point;
1387 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1388 const size_t cr_block_size = min(c - cr_block_start, cr);
1389 int32_t* packed_b = (int32_t*) packed_w;
1390 if XNN_LIKELY(b != NULL) {
1391 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1392 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1393 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1394 }
1395 } else {
1396 size_t n = cr_block_size;
1397 do {
1398 *((int32_t*) packed_w) = 0;
1399 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1400 } while (--n != 0);
1401 }
1402 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1403 for (size_t x = 0; x < w; x++) {
1404 for (size_t y = 0; y < h; y++) {
1405 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1406 const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1407 packed_b[cr_block_offset] -= (int32_t) kv * izp;
1408 *((int8_t*) packed_w) = kv;
1409 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1410 }
1411 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1412 }
1413 }
1414 }
1415}
1416
// Packs f32 GEMM weights, read as k[channel * kc + ci], into blocks of nr
// output channels with no bias slots.
//
// For each block, kc is covered in steps of kr and every step emits a row of
// nr * kr elements (kr per channel). For the first
// round_down_po2(kc, sr * kr) input elements the source offset of each channel
// is rotated using sr_mask; the remainder is copied in order, with a short
// final chunk allowed. Elements that have no source are skipped without being
// written. After each group k advances by nc * kc. The group loop is a
// do/while, so its body runs before g is tested. The params argument is not
// read.
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  // Number of packed elements in one row of nr channels.
  const size_t row_elems = nr * kr;
  do {
    for (size_t n_base = 0; n_base < nc; n_base += nr) {
      const size_t n_valid = min(nc - n_base, nr);

      // Part of kc that is a multiple of sr * kr: rotated source offsets.
      for (size_t kb = 0; kb < skc; kb += kr) {
        const size_t k_group = round_down_po2(kb, skr);
        for (size_t n = 0; n < n_valid; n++) {
          const float* src_row = &k[(n_base + n) * kc];
          const size_t k_first = k_group + ((kb + n * kr) & sr_mask);
          float* dst = &packed_w[n * kr];
          for (size_t kk = 0; kk < kr; kk++) {
            dst[kk] = src_row[k_first + kk];
          }
        }
        packed_w += row_elems;
      }

      // Remainder of kc: straight copy.
      for (size_t kb = skc; kb < kc; kb += kr) {
        const size_t k_valid = min(kc - kb, kr);
        for (size_t n = 0; n < n_valid; n++) {
          const float* src = &k[(n_base + n) * kc + kb];
          float* dst = &packed_w[n * kr];
          for (size_t kk = 0; kk < k_valid; kk++) {
            dst[kk] = src[kk];
          }
        }
        packed_w += row_elems;
      }
    }
    k += nc * kc;
    g -= 1;
  } while (g != 0);
}
1460
// Packs 16-bit GEMM weights, read as k[channel * kc + ci], into blocks of nr
// output channels with no bias slots. Elements are copied as raw uint16_t
// values.
//
// For each block, kc is covered in steps of kr and every step emits a row of
// nr * kr elements (kr per channel). For the first
// round_down_po2(kc, sr * kr) input elements the source offset of each channel
// is rotated using sr_mask; the remainder is copied in order, with a short
// final chunk allowed. Elements that have no source are skipped without being
// written. After each group k advances by nc * kc. The group loop is a
// do/while, so its body runs before g is tested. The params argument is not
// read.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  // Number of packed elements in one row of nr channels.
  const size_t row_elems = nr * kr;
  do {
    for (size_t n_base = 0; n_base < nc; n_base += nr) {
      const size_t n_valid = min(nc - n_base, nr);

      // Part of kc that is a multiple of sr * kr: rotated source offsets.
      for (size_t kb = 0; kb < skc; kb += kr) {
        const size_t k_group = round_down_po2(kb, skr);
        for (size_t n = 0; n < n_valid; n++) {
          const uint16_t* src_row = &k[(n_base + n) * kc];
          const size_t k_first = k_group + ((kb + n * kr) & sr_mask);
          uint16_t* dst = &packed_w[n * kr];
          for (size_t kk = 0; kk < kr; kk++) {
            dst[kk] = src_row[k_first + kk];
          }
        }
        packed_w += row_elems;
      }

      // Remainder of kc: straight copy.
      for (size_t kb = skc; kb < kc; kb += kr) {
        const size_t k_valid = min(kc - kb, kr);
        for (size_t n = 0; n < n_valid; n++) {
          const uint16_t* src = &k[(n_base + n) * kc + kb];
          uint16_t* dst = &packed_w[n * kr];
          for (size_t kk = 0; kk < k_valid; kk++) {
            dst[kk] = src[kk];
          }
        }
        packed_w += row_elems;
      }
    }
    k += nc * kc;
    g -= 1;
  } while (g != 0);
}
1504
Marat Dukhana6879bd2020-07-06 14:25:08 -07001505void xnn_pack_f32_dconv_oki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001506 size_t nc,
Marat Dukhanab582382020-07-06 13:32:08 -07001507 size_t kc,
1508 size_t nr,
Marat Dukhana6879bd2020-07-06 14:25:08 -07001509 size_t kh,
1510 size_t kw,
1511 const float* k,
1512 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001513 float* packed_w,
1514 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001515{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001516 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1517 const size_t nr_block_size = min(nc - nr_block_start, nr);
1518 if XNN_LIKELY(b != NULL) {
1519 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1520 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
Marat Dukhanab582382020-07-06 13:32:08 -07001521 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001522 } else {
1523 size_t n = nr;
1524 do {
1525 *packed_w++ = 0.0f;
1526 } while (--n != 0);
1527 }
Marat Dukhanab582382020-07-06 13:32:08 -07001528
Marat Dukhana6879bd2020-07-06 14:25:08 -07001529 for (size_t kx = 0; kx < kw; kx++) {
1530 for (size_t c = 0; c < kc; c++) {
1531 for (size_t ky = 0; ky < kh; ky++) {
1532 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1533 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
Marat Dukhanab582382020-07-06 13:32:08 -07001534 }
Marat Dukhanab582382020-07-06 13:32:08 -07001535 }
1536 }
1537 }
Marat Dukhanab582382020-07-06 13:32:08 -07001538 if XNN_UNPREDICTABLE(b != NULL) {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001539 b += nr;
Marat Dukhanab582382020-07-06 13:32:08 -07001540 }
1541 }
1542}
1543
1544void xnn_pack_f16_dconv_oki_w(
1545 size_t nc,
1546 size_t kc,
1547 size_t nr,
1548 size_t kh,
1549 size_t kw,
1550 const uint16_t* k,
1551 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001552 uint16_t* packed_w,
1553 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001554{
1555 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1556 const size_t nr_block_size = min(nc - nr_block_start, nr);
1557 if XNN_LIKELY(b != NULL) {
1558 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1559 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
1560 }
1561 } else {
1562 size_t n = nr;
1563 do {
1564 *packed_w++ = 0;
1565 } while (--n != 0);
1566 }
1567
1568 for (size_t kx = 0; kx < kw; kx++) {
1569 for (size_t c = 0; c < kc; c++) {
1570 for (size_t ky = 0; ky < kh; ky++) {
1571 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1572 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
1573 }
1574 }
1575 }
1576 }
1577 if XNN_UNPREDICTABLE(b != NULL) {
1578 b += nr;
1579 }
1580 }
1581}
1582
Marat Dukhana6879bd2020-07-06 14:25:08 -07001583void xnn_pack_f32_chw_dwconv_ghw_w(
1584 size_t kernel_size,
1585 size_t groups,
1586 const float* kernel,
1587 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001588 float* packed_weights,
1589 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001590{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001591 for (size_t g = 0; g < groups; g++) {
1592 if XNN_LIKELY(bias != NULL) {
1593 *packed_weights = *bias++;
Marat Dukhanab582382020-07-06 13:32:08 -07001594 } else {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001595 *packed_weights = 0.0f;
Marat Dukhanab582382020-07-06 13:32:08 -07001596 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001597 packed_weights += 1;
1598 for (size_t i = 0; i < kernel_size; i++) {
1599 *packed_weights++ = kernel[g * kernel_size + i];
Marat Dukhanab582382020-07-06 13:32:08 -07001600 }
1601 }
1602}
1603
1604void xnn_pack_f16_chw_dwconv_ghw_w(
1605 size_t kernel_size,
1606 size_t groups,
1607 const uint16_t* kernel,
1608 const uint16_t* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001609 uint16_t* packed_weights,
1610 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001611{
1612 for (size_t g = 0; g < groups; g++) {
1613 if XNN_LIKELY(bias != NULL) {
1614 *packed_weights = *bias++;
1615 } else {
1616 *packed_weights = 0;
1617 }
1618 packed_weights += 1;
1619 for (size_t i = 0; i < kernel_size; i++) {
1620 *packed_weights++ = kernel[g * kernel_size + i];
1621 }
1622 }
1623}
1624
Marat Dukhanab582382020-07-06 13:32:08 -07001625void xnn_pack_f32_chw_dwconv_hwg_w(
1626 size_t kernel_size,
1627 size_t groups,
1628 const float* kernel,
1629 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001630 float* packed_weights,
1631 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001632{
1633 for (size_t g = 0; g < groups; g++) {
1634 if XNN_LIKELY(bias != NULL) {
1635 *packed_weights = *bias++;
1636 } else {
1637 *packed_weights = 0.0f;
1638 }
1639 packed_weights += 1;
1640 for (size_t i = 0; i < kernel_size; i++) {
1641 *packed_weights++ = kernel[i * groups + g];
1642 }
1643 }
1644}
1645
1646void xnn_pack_f32_vmulcaddc_w(
1647 size_t c,
1648 size_t cr,
1649 const float* s,
1650 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001651 float* packed_w,
1652 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001653{
1654 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1655 const size_t cr_block_size = min(c - cr_block_start, cr);
1656 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1657 *packed_w++ = s[cr_block_start + cr_block_offset];
1658 }
1659 packed_w += cr - cr_block_size;
1660 if XNN_LIKELY(b != NULL) {
1661 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1662 *packed_w++ = b[cr_block_start + cr_block_offset];
1663 }
1664 } else {
1665 size_t n = cr_block_size;
1666 do {
1667 *packed_w++ = 0.0f;
1668 } while (--n != 0);
1669 }
1670 packed_w += cr - cr_block_size;
1671 }
1672}
1673
1674void xnn_pack_f16_vmulcaddc_w(
1675 size_t c,
1676 size_t cr,
1677 const uint16_t* s,
1678 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001679 uint16_t* packed_w,
1680 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001681{
1682 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1683 const size_t cr_block_size = min(c - cr_block_start, cr);
1684 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1685 *packed_w++ = s[cr_block_start + cr_block_offset];
1686 }
1687 packed_w += cr - cr_block_size;
1688 if XNN_LIKELY(b != NULL) {
1689 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1690 *packed_w++ = b[cr_block_start + cr_block_offset];
1691 }
1692 } else {
1693 size_t n = cr_block_size;
1694 do {
1695 *packed_w++ = 0;
1696 } while (--n != 0);
1697 }
1698 packed_w += cr - cr_block_size;
1699 }
1700}