// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8
9#include <stdint.h>
10#include <stddef.h>
Frank Barchard952cb512021-10-28 11:39:07 -070011#include <stdio.h> // for printf
Marat Dukhanab582382020-07-06 13:32:08 -070012
13#include <xnnpack/math.h>
14#include <xnnpack/pack.h>
15
16
Marat Dukhana6879bd2020-07-06 14:25:08 -070017void xnn_pack_f32_gemm_goi_w(
18 size_t g,
19 size_t nc,
20 size_t kc,
21 size_t nr,
22 size_t kr,
23 size_t sr,
24 const float* k,
25 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070026 float* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -070027 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -070028 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070029{
30 const size_t skr = sr * kr;
31 const size_t skc = round_down_po2(kc, skr);
32 const size_t sr_mask = (sr - 1) * kr;
33 do {
34 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
35 const size_t nr_block_size = min(nc - nr_block_start, nr);
36 if XNN_LIKELY(b != NULL) {
37 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
38 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
39 }
40 }
41 packed_w += nr;
42
43 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
44 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
45 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
46 *packed_w++ =
47 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
48 }
49 }
50 packed_w += (nr - nr_block_size) * kr;
51 }
52
53 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
54 const size_t kr_block_size = min(kc - kr_block_start, kr);
55 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
56 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
57 *packed_w++ =
58 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
59 }
60 packed_w += kr - kr_block_size;
61 }
62 packed_w += (nr - nr_block_size) * kr;
63 }
Marat Dukhane06c8132021-06-03 08:59:11 -070064 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -070065 }
66 k += nc * kc;
67 if XNN_UNPREDICTABLE(b != NULL) {
68 b += nc;
69 }
70 } while (--g != 0);
71}
72
73void xnn_pack_f16_gemm_goi_w(
74 size_t g,
75 size_t nc,
76 size_t kc,
77 size_t nr,
78 size_t kr,
79 size_t sr,
80 const uint16_t* k,
81 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070082 uint16_t* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -070083 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -070084 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070085{
86 const size_t skr = sr * kr;
87 const size_t skc = round_down_po2(kc, skr);
88 const size_t sr_mask = (sr - 1) * kr;
89 do {
90 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
91 const size_t nr_block_size = min(nc - nr_block_start, nr);
92 if XNN_LIKELY(b != NULL) {
93 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
94 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
95 }
96 }
97 packed_w += nr;
98
99 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
100 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
101 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
102 *packed_w++ =
103 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
104 }
105 }
106 packed_w += (nr - nr_block_size) * kr;
107 }
108
109 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
110 const size_t kr_block_size = min(kc - kr_block_start, kr);
111 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
112 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
113 *packed_w++ =
114 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
115 }
116 packed_w += kr - kr_block_size;
117 }
118 packed_w += (nr - nr_block_size) * kr;
119 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700120 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700121 }
122 k += nc * kc;
123 if XNN_UNPREDICTABLE(b != NULL) {
124 b += nc;
125 }
126 } while (--g != 0);
127}
128
Marat Dukhan08b7a972020-07-14 18:17:29 -0700129void xnn_pack_qu8_gemm_goi_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700130 size_t g,
131 size_t nc,
132 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700133 size_t nr,
134 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700135 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700136 const uint8_t* k,
137 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700138 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700139 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700140 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700141{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700142 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700143 const int32_t izp = (int32_t) params->input_zero_point;
144 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700145 do {
146 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
147 const size_t nr_block_size = min(nc - nr_block_start, nr);
148 int32_t* packed_b = (int32_t*) packed_w;
149 if XNN_LIKELY(b != NULL) {
150 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
151 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
152 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
153 }
154 } else {
155 size_t n = nr_block_size;
156 do {
157 *((int32_t*) packed_w) = boff;
158 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
159 } while (--n != 0);
160 }
161 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
162 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
163 const size_t kr_block_size = min(kc - kr_block_start, kr);
164 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
165 int32_t ksum = 0;
166 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
167 const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
168 ksum += (int32_t) kv;
169 *((uint8_t*) packed_w) = kv;
170 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
171 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700172 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700173 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
174 }
175 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
176 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700177 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -0700178 }
179 k += nc * kc;
180 if XNN_UNPREDICTABLE(b != NULL) {
181 b += nc;
182 }
183 } while (--g != 0);
184}
185
Marat Dukhan595e1702020-07-31 10:12:52 -0700186void xnn_pack_qs8_gemm_goi_w(
187 size_t g,
188 size_t nc,
189 size_t kc,
190 size_t nr,
191 size_t kr,
192 size_t sr,
193 const int8_t* k,
194 const int32_t* b,
195 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700196 size_t extra_bytes,
Marat Dukhan595e1702020-07-31 10:12:52 -0700197 const struct xnn_qs8_packing_params* params)
198{
Frank Barchard952cb512021-10-28 11:39:07 -0700199 const size_t skr = sr * kr;
200 const size_t skc = round_down_po2(kc, skr);
201 const size_t sr_mask = (sr - 1) * kr;
Marat Dukhan595e1702020-07-31 10:12:52 -0700202 const int32_t izp = (int32_t) params->input_zero_point;
203 do {
204 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
205 const size_t nr_block_size = min(nc - nr_block_start, nr);
206 int32_t* packed_b = (int32_t*) packed_w;
207 if XNN_LIKELY(b != NULL) {
208 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
209 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
210 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
211 }
212 } else {
213 size_t n = nr_block_size;
214 do {
215 *((int32_t*) packed_w) = 0;
216 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
217 } while (--n != 0);
218 }
219 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
Frank Barchard952cb512021-10-28 11:39:07 -0700220
221 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
222 const size_t kr_block_size = min(kc - kr_block_start, kr);
223 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
224 int32_t ksum = 0;
225 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
226 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset)];
227 ksum += (int32_t) kv;
228 *((int8_t*) packed_w) = kv;
229 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
230 }
231 packed_b[nr_block_offset] -= ksum * izp;
232 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
233 }
234 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
235 }
236
237 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
Marat Dukhan595e1702020-07-31 10:12:52 -0700238 const size_t kr_block_size = min(kc - kr_block_start, kr);
239 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
240 int32_t ksum = 0;
241 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
242 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
243 ksum += (int32_t) kv;
244 *((int8_t*) packed_w) = kv;
245 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
246 }
247 packed_b[nr_block_offset] -= ksum * izp;
248 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
249 }
250 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
251 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700252 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan595e1702020-07-31 10:12:52 -0700253 }
254 k += nc * kc;
255 if XNN_UNPREDICTABLE(b != NULL) {
256 b += nc;
257 }
258 } while (--g != 0);
259}
260
Marat Dukhan683fab32020-08-03 19:42:52 -0700261void xnn_pack_qs8_gemm_xw_goi_w(
262 size_t g,
263 size_t nc,
264 size_t kc,
265 size_t nr,
266 size_t kr,
267 size_t sr,
268 const int8_t* k,
269 const int32_t* b,
270 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700271 size_t extra_bytes,
Marat Dukhan683fab32020-08-03 19:42:52 -0700272 const struct xnn_qs8_packing_params* params)
273{
274 assert(sr == 1);
275 const int32_t izp = (int32_t) params->input_zero_point;
276 do {
277 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
278 const size_t nr_block_size = min(nc - nr_block_start, nr);
279 int32_t* packed_b = (int32_t*) packed_w;
280 if XNN_LIKELY(b != NULL) {
281 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
282 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
283 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
284 }
285 } else {
286 size_t n = nr_block_size;
287 do {
288 *((int32_t*) packed_w) = 0;
289 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
290 } while (--n != 0);
291 }
292 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
293 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
294 const size_t kr_block_size = min(kc - kr_block_start, kr);
295 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
296 int32_t ksum = 0;
297 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
298 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
299 ksum += (int32_t) kv;
300 *((int16_t*) packed_w) = (int16_t) kv;
301 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int16_t));
302 }
303 packed_b[nr_block_offset] -= ksum * izp;
304 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int16_t));
305 }
306 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int16_t));
307 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700308 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan683fab32020-08-03 19:42:52 -0700309 }
310 k += nc * kc;
311 if XNN_UNPREDICTABLE(b != NULL) {
312 b += nc;
313 }
314 } while (--g != 0);
315}
316
Marat Dukhana6879bd2020-07-06 14:25:08 -0700317void xnn_pack_f32_gemm_io_w(
318 size_t nc,
319 size_t kc,
320 size_t nr,
321 size_t kr,
322 size_t sr,
323 const float* k,
324 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700325 float* packed_w,
326 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700327{
328 const size_t skr = sr * kr;
329 const size_t skc = round_down_po2(kc, skr);
330 const size_t sr_mask = (sr - 1) * kr;
331 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
332 const size_t nr_block_size = min(nc - nr_block_start, nr);
333 if XNN_LIKELY(b != NULL) {
334 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
335 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
336 }
337 }
338 packed_w += nr;
339
340 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
341 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
342 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
343 *packed_w++ =
344 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
345 }
346 }
347 packed_w += (nr - nr_block_size) * kr;
348 }
349
350 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
351 const size_t kr_block_size = min(kc - kr_block_start, kr);
352 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
353 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
354 *packed_w++ =
355 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
356 }
357 packed_w += kr - kr_block_size;
358 }
359 packed_w += (nr - nr_block_size) * kr;
360 }
361 }
362}
363
364void xnn_pack_f16_gemm_io_w(
365 size_t nc,
366 size_t kc,
367 size_t nr,
368 size_t kr,
369 size_t sr,
370 const uint16_t* k,
371 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700372 uint16_t* packed_w,
373 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700374{
375 const size_t skr = sr * kr;
376 const size_t skc = round_down_po2(kc, skr);
377 const size_t sr_mask = (sr - 1) * kr;
378 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
379 const size_t nr_block_size = min(nc - nr_block_start, nr);
380 if XNN_LIKELY(b != NULL) {
381 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
382 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
383 }
384 }
385 packed_w += nr;
386
387 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
388 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
389 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
390 *packed_w++ =
391 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
392 }
393 }
394 packed_w += (nr - nr_block_size) * kr;
395 }
396
397 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
398 const size_t kr_block_size = min(kc - kr_block_start, kr);
399 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
400 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
401 *packed_w++ =
402 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
403 }
404 packed_w += kr - kr_block_size;
405 }
406 packed_w += (nr - nr_block_size) * kr;
407 }
408 }
409}
410
Marat Dukhan08b7a972020-07-14 18:17:29 -0700411void xnn_pack_qu8_gemm_io_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700412 size_t nc,
413 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700414 size_t nr,
415 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700416 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700417 const uint8_t* k,
418 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700419 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700420 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700421{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700422 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700423 const int32_t izp = (int32_t) params->input_zero_point;
424 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700425 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
426 const size_t nr_block_size = min(nc - nr_block_start, nr);
427 int32_t* packed_b = (int32_t*) packed_w;
428 if XNN_LIKELY(b != NULL) {
429 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
430 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
431 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
432 }
433 } else {
434 size_t n = nr_block_size;
435 do {
436 *((int32_t*) packed_w) = boff;
437 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
438 } while (--n != 0);
439 }
440 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
441 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
442 const size_t kr_block_size = min(kc - kr_block_start, kr);
443 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
444 int32_t ksum = 0;
445 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
446 const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
447 ksum += (int32_t) kv;
448 *((uint8_t*) packed_w) = kv;
449 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
450 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700451 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700452 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
453 }
454 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
455 }
456 }
457}
458
Marat Dukhand23cb6e2021-04-01 01:18:58 -0700459void xnn_pack_qs8_gemm_io_w(
460 size_t nc,
461 size_t kc,
462 size_t nr,
463 size_t kr,
464 size_t sr,
465 const int8_t* k,
466 const int32_t* b,
467 void* packed_w,
468 const struct xnn_qs8_packing_params* params)
469{
470 assert(sr == 1);
471 const int32_t izp = (int32_t) params->input_zero_point;
472 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
473 const size_t nr_block_size = min(nc - nr_block_start, nr);
474 int32_t* packed_b = (int32_t*) packed_w;
475 if XNN_LIKELY(b != NULL) {
476 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
477 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
478 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
479 }
480 } else {
481 size_t n = nr_block_size;
482 do {
483 *((int32_t*) packed_w) = 0;
484 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
485 } while (--n != 0);
486 }
487 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
488 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
489 const size_t kr_block_size = min(kc - kr_block_start, kr);
490 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
491 int32_t ksum = 0;
492 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
493 const int8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
494 ksum += (int32_t) kv;
495 *((int8_t*) packed_w) = kv;
496 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
497 }
498 packed_b[nr_block_offset] -= ksum * izp;
499 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
500 }
501 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
502 }
503 }
504}
505
Marat Dukhana6879bd2020-07-06 14:25:08 -0700506void xnn_pack_f32_conv_goki_w(
507 size_t g,
508 size_t nc,
509 size_t ks,
510 size_t kc,
511 size_t nr,
512 size_t kr,
513 size_t sr,
514 const float* k,
515 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700516 float* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700517 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700518 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700519{
520 const size_t skr = sr * kr;
521 const size_t skc = round_down_po2(kc, skr);
522 const size_t sr_mask = (sr - 1) * kr;
523 do {
524 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
525 const size_t nr_block_size = min(nc - nr_block_start, nr);
526 if XNN_LIKELY(b != NULL) {
527 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
528 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
529 }
530 }
531 packed_w += nr;
532
533 for (size_t ki = 0; ki < ks; ki++) {
534 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
535 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
536 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
537 *packed_w++ =
538 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
539 }
540 }
541 packed_w += (nr - nr_block_size) * kr;
542 }
543
544 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
545 const size_t kr_block_size = min(kc - kr_block_start, kr);
546 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
547 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
548 *packed_w++ =
549 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
550 }
551 packed_w += kr - kr_block_size;
552 }
553 packed_w += (nr - nr_block_size) * kr;
554 }
555 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700556 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700557 }
558 k += ks * kc * nc;
559 if XNN_UNPREDICTABLE(b != NULL) {
560 b += nc;
561 }
562 } while (--g != 0);
563}
564
565void xnn_pack_f16_conv_goki_w(
566 size_t g,
567 size_t nc,
568 size_t ks,
569 size_t kc,
570 size_t nr,
571 size_t kr,
572 size_t sr,
573 const uint16_t* k,
574 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700575 uint16_t* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700576 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700577 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700578{
579 const size_t skr = sr * kr;
580 const size_t skc = round_down_po2(kc, skr);
581 const size_t sr_mask = (sr - 1) * kr;
582 do {
583 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
584 const size_t nr_block_size = min(nc - nr_block_start, nr);
585 if XNN_LIKELY(b != NULL) {
586 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
587 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
588 }
589 }
590 packed_w += nr;
591
592 for (size_t ki = 0; ki < ks; ki++) {
593 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
594 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
595 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
596 *packed_w++ =
597 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
598 }
599 }
600 packed_w += (nr - nr_block_size) * kr;
601 }
602
603 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
604 const size_t kr_block_size = min(kc - kr_block_start, kr);
605 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
606 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
607 *packed_w++ =
608 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
609 }
610 packed_w += kr - kr_block_size;
611 }
612 packed_w += (nr - nr_block_size) * kr;
613 }
614 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700615 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700616 }
617 k += ks * kc * nc;
618 if XNN_UNPREDICTABLE(b != NULL) {
619 b += nc;
620 }
621 } while (--g != 0);
622}
623
Marat Dukhan08b7a972020-07-14 18:17:29 -0700624void xnn_pack_qu8_conv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700625 size_t g,
626 size_t nc,
627 size_t ks,
628 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700629 size_t nr,
630 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700631 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700632 const uint8_t* k,
633 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700634 void* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700635 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700636 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700637{
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700638 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700639 const int32_t izp = (int32_t) params->input_zero_point;
640 const int32_t boff = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700641 do {
642 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
643 const size_t nr_block_size = min(nc - nr_block_start, nr);
644 int32_t* packed_b = (int32_t*) packed_w;
645 if XNN_LIKELY(b != NULL) {
646 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
647 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
648 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
649 }
650 } else {
651 size_t n = nr_block_size;
652 do {
653 *((int32_t*) packed_w) = boff;
654 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
655 } while (--n != 0);
656 }
657 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
658 for (size_t ki = 0; ki < ks; ki++) {
659 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
660 const size_t kr_block_size = min(kc - kr_block_start, kr);
661 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
662 int32_t ksum = 0;
663 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
664 const uint8_t kv =
665 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
666 ksum += (int32_t) kv;
667 *((uint8_t*) packed_w) = kv;
668 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
669 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700670 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700671 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
672 }
673 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
674 }
675 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700676 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -0700677 }
678 k += ks * kc * nc;
679 if XNN_UNPREDICTABLE(b != NULL) {
680 b += nc;
681 }
682 } while (--g != 0);
683}
684
Marat Dukhanf9480682020-07-31 14:50:24 -0700685void xnn_pack_qs8_conv_goki_w(
686 size_t g,
687 size_t nc,
688 size_t ks,
689 size_t kc,
690 size_t nr,
691 size_t kr,
692 size_t sr,
693 const int8_t* k,
694 const int32_t* b,
695 void* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700696 size_t extra_bytes,
Marat Dukhanf9480682020-07-31 14:50:24 -0700697 const struct xnn_qs8_packing_params* params)
698{
Frank Barchard952cb512021-10-28 11:39:07 -0700699 const size_t skr = sr * kr;
700 const size_t skc = round_down_po2(kc, skr);
701 const size_t sr_mask = (sr - 1) * kr;
Marat Dukhanf9480682020-07-31 14:50:24 -0700702 const int32_t izp = (int32_t) params->input_zero_point;
703 do {
704 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
705 const size_t nr_block_size = min(nc - nr_block_start, nr);
706 int32_t* packed_b = (int32_t*) packed_w;
707 if XNN_LIKELY(b != NULL) {
708 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
709 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
710 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
711 }
712 } else {
713 size_t n = nr_block_size;
714 do {
715 *((int32_t*) packed_w) = 0;
716 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
717 } while (--n != 0);
718 }
719 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
Frank Barchard952cb512021-10-28 11:39:07 -0700720
Marat Dukhanf9480682020-07-31 14:50:24 -0700721 for (size_t ki = 0; ki < ks; ki++) {
Frank Barchard952cb512021-10-28 11:39:07 -0700722 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
723 const size_t kr_block_size = min(kc - kr_block_start, kr);
724 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
725 int32_t ksum = 0;
726 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
727 const int8_t kv =
728 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset)];
729 ksum += (int32_t) kv;
730 *((int8_t*) packed_w) = kv;
731 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
732 }
733 packed_b[nr_block_offset] -= ksum * izp;
734 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
735 }
736 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
737 }
738
739 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
Marat Dukhanf9480682020-07-31 14:50:24 -0700740 const size_t kr_block_size = min(kc - kr_block_start, kr);
741 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
742 int32_t ksum = 0;
743 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
744 const int8_t kv =
745 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
746 ksum += (int32_t) kv;
747 *((int8_t*) packed_w) = kv;
748 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
749 }
750 packed_b[nr_block_offset] -= ksum * izp;
751 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
752 }
753 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
754 }
Frank Barchard952cb512021-10-28 11:39:07 -0700755
Marat Dukhanf9480682020-07-31 14:50:24 -0700756 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700757 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanf9480682020-07-31 14:50:24 -0700758 }
759 k += ks * kc * nc;
760 if XNN_UNPREDICTABLE(b != NULL) {
761 b += nc;
762 }
763 } while (--g != 0);
764}
765
Marat Dukhana6879bd2020-07-06 14:25:08 -0700766void xnn_pack_f32_conv_kgo_w(
767 size_t g,
768 size_t nc,
769 size_t ks,
770 size_t nr,
771 size_t kr,
772 const float* k,
773 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700774 float* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700775 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700776 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700777{
778 for (size_t i = 0; i < g; i++) {
779 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
780 const size_t nr_block_size = min(nc - nr_block_start, nr);
781 if XNN_LIKELY(b != NULL) {
782 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
783 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
784 }
785 }
786 packed_w += nr;
787 for (size_t ki = 0; ki < ks; ki++) {
788 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
789 *packed_w =
790 k[ki * g * nc + (nr_block_start + nr_block_offset)];
791 packed_w += kr;
792 }
793 packed_w += (nr - nr_block_size) * kr;
794 }
Marat Dukhan97262462021-06-18 16:14:17 -0700795 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700796 }
797 k += nc;
798 if XNN_UNPREDICTABLE(b != NULL) {
799 b += nc;
800 }
801 }
802}
803
804void xnn_pack_f16_conv_kgo_w(
805 size_t g,
806 size_t nc,
807 size_t ks,
808 size_t nr,
809 size_t kr,
810 const uint16_t* k,
811 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700812 uint16_t* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700813 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700814 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700815{
816 for (size_t i = 0; i < g; i++) {
817 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
818 const size_t nr_block_size = min(nc - nr_block_start, nr);
819 if XNN_LIKELY(b != NULL) {
820 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
821 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
822 }
823 }
824 packed_w += nr;
825 for (size_t ki = 0; ki < ks; ki++) {
826 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
827 *packed_w =
828 k[ki * g * nc + (nr_block_start + nr_block_offset)];
829 packed_w += kr;
830 }
831 packed_w += (nr - nr_block_size) * kr;
832 }
Marat Dukhan97262462021-06-18 16:14:17 -0700833 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700834 }
835 k += nc;
836 if XNN_UNPREDICTABLE(b != NULL) {
837 b += nc;
838 }
839 }
840}
841
Marat Dukhan08b7a972020-07-14 18:17:29 -0700842void xnn_pack_qu8_conv_kgo_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700843 size_t g,
844 size_t nc,
845 size_t ks,
Marat Dukhana6879bd2020-07-06 14:25:08 -0700846 size_t nr,
847 size_t kr,
Marat Dukhanab582382020-07-06 13:32:08 -0700848 const uint8_t* k,
849 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700850 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700851 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700852 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700853{
Marat Dukhanb42f8662020-07-06 20:46:13 -0700854 const int32_t izp = (int32_t) params->input_zero_point;
855 const int32_t boff = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700856 for (size_t i = 0; i < g; i++) {
857 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
858 const size_t nr_block_size = min(nc - nr_block_start, nr);
859 int32_t* packed_b = (int32_t*) packed_w;
860 if XNN_LIKELY(b != NULL) {
861 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
862 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
863 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
864 }
865 } else {
866 size_t n = nr_block_size;
867 do {
868 *((int32_t*) packed_w) = boff;
869 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
870 } while (--n != 0);
871 }
872 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
873 for (size_t ki = 0; ki < ks; ki++) {
874 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
875 const uint8_t kv =
876 k[ki * g * nc + (nr_block_start + nr_block_offset)];
877 *((uint8_t*) packed_w) = kv;
Marat Dukhanb42f8662020-07-06 20:46:13 -0700878 packed_b[nr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700879 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
880 }
881 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
882 }
Marat Dukhan97262462021-06-18 16:14:17 -0700883 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -0700884 }
885 k += nc;
886 if XNN_UNPREDICTABLE(b != NULL) {
887 b += nc;
888 }
889 }
890}
891
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700892void xnn_pack_qs8_conv_kgo_w(
893 size_t g,
894 size_t nc,
895 size_t ks,
896 size_t nr,
897 size_t kr,
898 const int8_t* k,
899 const int32_t* b,
900 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700901 size_t extra_bytes,
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700902 const struct xnn_qs8_packing_params* params)
903{
904 const int32_t izp = (int32_t) params->input_zero_point;
905 for (size_t i = 0; i < g; i++) {
906 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
907 const size_t nr_block_size = min(nc - nr_block_start, nr);
908 int32_t* packed_b = (int32_t*) packed_w;
909 if XNN_LIKELY(b != NULL) {
910 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
911 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
912 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
913 }
914 } else {
915 size_t n = nr_block_size;
916 do {
917 *((int32_t*) packed_w) = 0;
918 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
919 } while (--n != 0);
920 }
921 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
922 for (size_t ki = 0; ki < ks; ki++) {
923 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
924 const int8_t kv =
925 k[ki * g * nc + (nr_block_start + nr_block_offset)];
926 *((int8_t*) packed_w) = kv;
927 packed_b[nr_block_offset] -= (int32_t) kv * izp;
928 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(int8_t));
929 }
930 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
931 }
Marat Dukhan97262462021-06-18 16:14:17 -0700932 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700933 }
934 k += nc;
935 if XNN_UNPREDICTABLE(b != NULL) {
936 b += nc;
937 }
938 }
939}
940
Marat Dukhana6879bd2020-07-06 14:25:08 -0700941void xnn_pack_f32_deconv_goki_w(
942 size_t g,
943 size_t nc,
944 size_t kh,
945 size_t kw,
946 size_t kc,
947 size_t sh,
948 size_t sw,
949 size_t nr,
950 size_t kr,
951 size_t sr,
952 const float* k,
953 const float* b,
954 float* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700955 struct subconvolution_params* subconv_params,
956 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700957{
958 const size_t skr = sr * kr;
959 const size_t skc = round_down_po2(kc, skr);
960 const size_t sr_mask = (sr - 1) * kr;
961 for (size_t i = 0; i < g; i++) {
962 for (size_t oy = 0; oy < sh; oy++) {
963 for (size_t ox = 0; ox < sw; ox++) {
964 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700965 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700966 }
967 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
968 const size_t nr_block_size = min(nc - nr_block_start, nr);
969 if XNN_LIKELY(b != NULL) {
970 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
971 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
972 }
973 }
974 packed_w += nr;
975 for (size_t ky = oy; ky < kh; ky += sh) {
976 for (size_t kx = ox; kx < kw; kx += sw) {
977 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
978 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
979 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
980 *packed_w++ =
981 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
982 }
983 }
984 packed_w += (nr - nr_block_size) * kr;
985 }
986
987 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
988 const size_t kr_block_size = min(kc - kr_block_start, kr);
989 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
990 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
991 *packed_w++ =
992 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
993 }
994 packed_w += kr - kr_block_size;
995 }
996 packed_w += (nr - nr_block_size) * kr;
997 }
998 }
999 }
1000 }
1001 }
1002 }
1003 k += kh * kw * kc * nc;
1004 if XNN_UNPREDICTABLE(b != NULL) {
1005 b += nc;
1006 }
1007 }
1008}
1009
1010void xnn_pack_f16_deconv_goki_w(
1011 size_t g,
1012 size_t nc,
1013 size_t kh,
1014 size_t kw,
1015 size_t kc,
1016 size_t sh,
1017 size_t sw,
1018 size_t nr,
1019 size_t kr,
1020 size_t sr,
1021 const uint16_t* k,
1022 const uint16_t* b,
1023 uint16_t* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001024 struct subconvolution_params* subconv_params,
1025 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001026{
1027 const size_t skr = sr * kr;
1028 const size_t skc = round_down_po2(kc, skr);
1029 const size_t sr_mask = (sr - 1) * kr;
1030 for (size_t i = 0; i < g; i++) {
1031 for (size_t oy = 0; oy < sh; oy++) {
1032 for (size_t ox = 0; ox < sw; ox++) {
1033 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -07001034 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -07001035 }
1036 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1037 const size_t nr_block_size = min(nc - nr_block_start, nr);
1038 if XNN_LIKELY(b != NULL) {
1039 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1040 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
1041 }
1042 }
1043 packed_w += nr;
1044 for (size_t ky = oy; ky < kh; ky += sh) {
1045 for (size_t kx = ox; kx < kw; kx += sw) {
1046 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
1047 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1048 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1049 *packed_w++ =
1050 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
1051 }
1052 }
1053 packed_w += (nr - nr_block_size) * kr;
1054 }
1055
1056 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
1057 const size_t kr_block_size = min(kc - kr_block_start, kr);
1058 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1059 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1060 *packed_w++ =
1061 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
1062 }
1063 packed_w += kr - kr_block_size;
1064 }
1065 packed_w += (nr - nr_block_size) * kr;
1066 }
1067 }
1068 }
1069 }
1070 }
1071 }
1072 k += kh * kw * kc * nc;
1073 if XNN_UNPREDICTABLE(b != NULL) {
1074 b += nc;
1075 }
1076 }
1077}
1078
Marat Dukhanbea849a2021-07-30 16:25:30 -07001079void xnn_pack_qs8_deconv_goki_w(
1080 size_t g,
1081 size_t nc,
1082 size_t kh,
1083 size_t kw,
1084 size_t kc,
1085 size_t sh,
1086 size_t sw,
1087 size_t nr,
1088 size_t kr,
1089 size_t sr,
1090 const int8_t* k,
1091 const int32_t* b,
1092 void* packed_w,
1093 struct subconvolution_params* subconv_params,
1094 const struct xnn_qs8_packing_params* params)
1095{
1096 assert(sr == 1);
1097 const int32_t izp = (int32_t) params->input_zero_point;
1098 for (size_t i = 0; i < g; i++) {
1099 for (size_t oy = 0; oy < sh; oy++) {
1100 for (size_t ox = 0; ox < sw; ox++) {
1101 if (i == 0) {
1102 (*subconv_params++).weights = packed_w;
1103 }
1104 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1105 const size_t nr_block_size = min(nc - nr_block_start, nr);
1106 int32_t* packed_b = (int32_t*) packed_w;
1107 if XNN_LIKELY(b != 0) {
1108 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1109 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
1110 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1111 }
1112 } else {
1113 size_t n = nr_block_size;
1114 do {
1115 *((int32_t*) packed_w) = 0;
1116 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1117 } while (--n != 0);
1118 }
1119 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1120 for (size_t ky = oy; ky < kh; ky += sh) {
1121 for (size_t kx = ox; kx < kw; kx += sw) {
1122 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
1123 const size_t kr_block_size = min(kc - kr_block_start, kr);
1124 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1125 int32_t ksum = 0;
1126 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1127 const int8_t kv =
1128 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
1129 ksum += (int32_t) kv;
1130 *((int8_t*) packed_w) = kv;
1131 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1132 }
1133 packed_b[nr_block_offset] -= ksum * izp;
1134 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
1135 }
1136 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
1137 }
1138 }
1139 }
1140 }
1141 }
1142 }
1143 k += kh * kw * kc * nc;
1144 if XNN_UNPREDICTABLE(b != NULL) {
1145 b += nc;
1146 }
1147 }
1148}
1149
Marat Dukhan08b7a972020-07-14 18:17:29 -07001150void xnn_pack_qu8_deconv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001151 size_t g,
1152 size_t nc,
1153 size_t kh,
1154 size_t kw,
1155 size_t kc,
1156 size_t sh,
1157 size_t sw,
1158 size_t nr,
1159 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -07001160 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -07001161 const uint8_t* k,
1162 const int32_t* b,
1163 void* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001164 struct subconvolution_params* subconv_params,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001165 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001166{
Marat Dukhan5a698bb2020-07-07 20:47:55 -07001167 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -07001168 const int32_t izp = (int32_t) params->input_zero_point;
1169 const int32_t kzp = (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001170 for (size_t i = 0; i < g; i++) {
1171 for (size_t oy = 0; oy < sh; oy++) {
1172 for (size_t ox = 0; ox < sw; ox++) {
1173 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -07001174 (*subconv_params++).weights = packed_w;
Marat Dukhanab582382020-07-06 13:32:08 -07001175 }
Marat Dukhanb42f8662020-07-06 20:46:13 -07001176 const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
Marat Dukhanab582382020-07-06 13:32:08 -07001177 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1178 const size_t nr_block_size = min(nc - nr_block_start, nr);
1179 int32_t* packed_b = (int32_t*) packed_w;
1180 if XNN_LIKELY(b != 0) {
1181 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1182 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
1183 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1184 }
1185 } else {
1186 size_t n = nr_block_size;
1187 do {
1188 *((int32_t*) packed_w) = boff;
1189 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1190 } while (--n != 0);
1191 }
1192 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1193 for (size_t ky = oy; ky < kh; ky += sh) {
1194 for (size_t kx = ox; kx < kw; kx += sw) {
1195 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
1196 const size_t kr_block_size = min(kc - kr_block_start, kr);
1197 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1198 int32_t ksum = 0;
1199 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1200 const uint8_t kv =
1201 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
1202 ksum += (int32_t) kv;
1203 *((uint8_t*) packed_w) = kv;
1204 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1205 }
Marat Dukhanb42f8662020-07-06 20:46:13 -07001206 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001207 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
1208 }
1209 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
1210 }
1211 }
1212 }
1213 }
1214 }
1215 }
1216 k += kh * kw * kc * nc;
1217 if XNN_UNPREDICTABLE(b != NULL) {
1218 b += nc;
1219 }
1220 }
1221}
1222
Marat Dukhana6879bd2020-07-06 14:25:08 -07001223void xnn_pack_f32_dwconv_ghw_w(
1224 size_t h,
1225 size_t w,
1226 size_t c,
1227 size_t cr,
1228 const float* k,
1229 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001230 float* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001231 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001232 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001233{
1234 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1235 const size_t cr_block_size = min(c - cr_block_start, cr);
1236 if XNN_LIKELY(b != NULL) {
1237 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1238 *packed_w++ = b[cr_block_start + cr_block_offset];
1239 }
1240 } else {
1241 size_t n = cr_block_size;
1242 do {
1243 *packed_w++ = 0.0f;
1244 } while (--n != 0);
1245 }
1246 packed_w += cr - cr_block_size;
1247 for (size_t x = 0; x < w; x++) {
1248 for (size_t y = 0; y < h; y++) {
1249 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1250 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1251 *packed_w++ = kv;
1252 }
1253 packed_w += cr - cr_block_size;
1254 }
1255 }
Marat Dukhan82286892021-06-04 17:27:27 -07001256 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001257 }
1258}
1259
1260void xnn_pack_f16_dwconv_ghw_w(
1261 size_t h,
1262 size_t w,
1263 size_t c,
1264 size_t cr,
1265 const uint16_t* k,
1266 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001267 uint16_t* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001268 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001269 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001270{
1271 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1272 const size_t cr_block_size = min(c - cr_block_start, cr);
1273 if XNN_LIKELY(b != NULL) {
1274 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1275 *packed_w++ = b[cr_block_start + cr_block_offset];
1276 }
1277 } else {
1278 size_t n = cr_block_size;
1279 do {
1280 *packed_w++ = 0;
1281 } while (--n != 0);
1282 }
1283 packed_w += cr - cr_block_size;
1284 for (size_t x = 0; x < w; x++) {
1285 for (size_t y = 0; y < h; y++) {
1286 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1287 const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1288 *packed_w++ = kv;
1289 }
1290 packed_w += cr - cr_block_size;
1291 }
1292 }
Marat Dukhan82286892021-06-04 17:27:27 -07001293 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001294 }
1295}
1296
Marat Dukhan08b7a972020-07-14 18:17:29 -07001297void xnn_pack_qu8_dwconv_ghw_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001298 size_t h,
1299 size_t w,
1300 size_t c,
1301 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001302 const uint8_t* k,
1303 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001304 void* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001305 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001306 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001307{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001308 const int32_t izp = (int32_t) params->input_zero_point;
1309 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001310 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1311 const size_t cr_block_size = min(c - cr_block_start, cr);
1312 int32_t* packed_b = (int32_t*) packed_w;
1313 if XNN_LIKELY(b != NULL) {
1314 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1315 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1316 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1317 }
1318 } else {
1319 size_t n = cr_block_size;
1320 do {
1321 *((int32_t*) packed_w) = boff;
1322 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1323 } while (--n != 0);
1324 }
1325 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1326 for (size_t x = 0; x < w; x++) {
1327 for (size_t y = 0; y < h; y++) {
1328 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1329 const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001330 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001331 *((uint8_t*) packed_w) = kv;
1332 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1333 }
1334 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1335 }
1336 }
Marat Dukhan82286892021-06-04 17:27:27 -07001337 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -07001338 }
1339}
1340
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001341void xnn_pack_qs8_dwconv_ghw_w(
1342 size_t h,
1343 size_t w,
1344 size_t c,
1345 size_t cr,
1346 const int8_t* k,
1347 const int32_t* b,
1348 void* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001349 size_t extra_bytes,
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001350 const struct xnn_qs8_packing_params* params)
1351{
1352 const int32_t izp = (int32_t) params->input_zero_point;
1353 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1354 const size_t cr_block_size = min(c - cr_block_start, cr);
1355 int32_t* packed_b = (int32_t*) packed_w;
1356 if XNN_LIKELY(b != NULL) {
1357 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1358 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1359 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1360 }
1361 } else {
1362 size_t n = cr_block_size;
1363 do {
1364 *((int32_t*) packed_w) = 0;
1365 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1366 } while (--n != 0);
1367 }
1368 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1369 for (size_t x = 0; x < w; x++) {
1370 for (size_t y = 0; y < h; y++) {
1371 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1372 const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1373 packed_b[cr_block_offset] -= (int32_t) kv * izp;
1374 *((int8_t*) packed_w) = kv;
1375 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1376 }
1377 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1378 }
1379 }
Marat Dukhan82286892021-06-04 17:27:27 -07001380 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001381 }
1382}
1383
Marat Dukhana6879bd2020-07-06 14:25:08 -07001384void xnn_pack_f32_dwconv_hwg_w(
1385 size_t h,
1386 size_t w,
1387 size_t c,
1388 size_t cr,
1389 const float* k,
1390 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001391 float* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001392 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001393 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001394{
1395 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1396 const size_t cr_block_size = min(c - cr_block_start, cr);
1397 if XNN_LIKELY(b != NULL) {
1398 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1399 *packed_w++ = b[cr_block_start + cr_block_offset];
1400 }
1401 } else {
1402 size_t n = cr_block_size;
1403 do {
1404 *packed_w++ = 0.0f;
1405 } while (--n != 0);
1406 }
1407 packed_w += cr - cr_block_size;
1408 for (size_t x = 0; x < w; x++) {
1409 for (size_t y = 0; y < h; y++) {
1410 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1411 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1412 *packed_w++ = kv;
1413 }
1414 packed_w += cr - cr_block_size;
1415 }
1416 }
Marat Dukhan97262462021-06-18 16:14:17 -07001417 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001418 }
1419}
1420
1421void xnn_pack_f16_dwconv_hwg_w(
1422 size_t h,
1423 size_t w,
1424 size_t c,
1425 size_t cr,
1426 const uint16_t* k,
1427 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001428 uint16_t* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001429 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001430 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001431{
1432 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1433 const size_t cr_block_size = min(c - cr_block_start, cr);
1434 if XNN_LIKELY(b != NULL) {
1435 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1436 *packed_w++ = b[cr_block_start + cr_block_offset];
1437 }
1438 } else {
1439 size_t n = cr_block_size;
1440 do {
1441 *packed_w++ = 0;
1442 } while (--n != 0);
1443 }
1444 packed_w += cr - cr_block_size;
1445 for (size_t x = 0; x < w; x++) {
1446 for (size_t y = 0; y < h; y++) {
1447 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1448 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1449 *packed_w++ = kv;
1450 }
1451 packed_w += cr - cr_block_size;
1452 }
1453 }
Marat Dukhan97262462021-06-18 16:14:17 -07001454 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001455 }
1456}
1457
Marat Dukhan08b7a972020-07-14 18:17:29 -07001458void xnn_pack_qu8_dwconv_hwg_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001459 size_t h,
1460 size_t w,
1461 size_t c,
1462 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001463 const uint8_t* k,
1464 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001465 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001466 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001467 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001468{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001469 const int32_t izp = (int32_t) params->input_zero_point;
1470 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001471 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1472 const size_t cr_block_size = min(c - cr_block_start, cr);
1473 int32_t* packed_b = (int32_t*) packed_w;
1474 if XNN_LIKELY(b != NULL) {
1475 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1476 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1477 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1478 }
1479 } else {
1480 size_t n = cr_block_size;
1481 do {
1482 *((int32_t*) packed_w) = boff;
1483 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1484 } while (--n != 0);
1485 }
1486 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1487 for (size_t x = 0; x < w; x++) {
1488 for (size_t y = 0; y < h; y++) {
1489 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1490 const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001491 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001492 *((uint8_t*) packed_w) = kv;
1493 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1494 }
1495 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1496 }
1497 }
Marat Dukhan97262462021-06-18 16:14:17 -07001498 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -07001499 }
1500}
1501
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001502void xnn_pack_qs8_dwconv_hwg_w(
1503 size_t h,
1504 size_t w,
1505 size_t c,
1506 size_t cr,
1507 const int8_t* k,
1508 const int32_t* b,
1509 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001510 size_t extra_bytes,
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001511 const struct xnn_qs8_packing_params* params)
1512{
1513 const int32_t izp = (int32_t) params->input_zero_point;
1514 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1515 const size_t cr_block_size = min(c - cr_block_start, cr);
1516 int32_t* packed_b = (int32_t*) packed_w;
1517 if XNN_LIKELY(b != NULL) {
1518 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1519 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1520 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1521 }
1522 } else {
1523 size_t n = cr_block_size;
1524 do {
1525 *((int32_t*) packed_w) = 0;
1526 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1527 } while (--n != 0);
1528 }
1529 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1530 for (size_t x = 0; x < w; x++) {
1531 for (size_t y = 0; y < h; y++) {
1532 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1533 const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1534 packed_b[cr_block_offset] -= (int32_t) kv * izp;
1535 *((int8_t*) packed_w) = kv;
1536 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1537 }
1538 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1539 }
1540 }
Marat Dukhan97262462021-06-18 16:14:17 -07001541 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001542 }
1543}
1544
// Packs `g` groups of float weights, without bias, into `packed_w`.
//
// Each group in `k` holds nc rows of kc elements (row n starts at k[n * kc]).
// Rows are processed in tiles of up to `nr`. Within a tile the columns are
// walked in chunks of `kr`; for every chunk, kr elements are emitted for each
// row of the tile in turn.
//
// Columns below round_down_po2(kc, sr * kr) are read through a shuffled
// offset: the chunk's position inside its sr * kr span is shifted by
// row * kr and masked with (sr - 1) * kr. The remaining columns are copied in
// order.
// NOTE(review): round_down_po2 is defined elsewhere; presumably it rounds its
// first argument down to a multiple of the (power-of-two) second - confirm.
//
// Output slots for missing rows of a partial tile, and for missing columns of
// a partial trailing chunk, are skipped rather than written.
//
// The group loop is a do/while, so its body runs before `g` is tested; g == 0
// is not handled.
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t shuffle_span = sr * kr;
  const size_t shuffled_kc = round_down_po2(kc, shuffle_span);
  const size_t shuffle_mask = (sr - 1) * kr;
  do {
    size_t row_base = 0;
    while (row_base < nc) {
      const size_t rows = min(nc - row_base, nr);
      // Output slots belonging to rows absent from a partial tile.
      const size_t row_padding = (nr - rows) * kr;

      // Shuffled part: every chunk here is a full kr elements wide.
      for (size_t col_base = 0; col_base < shuffled_kc; col_base += kr) {
        const size_t span_base = round_down_po2(col_base, shuffle_span);
        for (size_t row = 0; row < rows; row++) {
          const size_t rotated = (col_base + row * kr) & shuffle_mask;
          const float* src = &k[(row_base + row) * kc + span_base + rotated];
          for (size_t i = 0; i < kr; i++) {
            packed_w[i] = src[i];
          }
          packed_w += kr;
        }
        packed_w += row_padding;
      }

      // Remainder: straight copy; the last chunk may be narrower than kr.
      for (size_t col_base = shuffled_kc; col_base < kc; col_base += kr) {
        const size_t cols = min(kc - col_base, kr);
        for (size_t row = 0; row < rows; row++) {
          const float* src = &k[(row_base + row) * kc + col_base];
          for (size_t i = 0; i < cols; i++) {
            packed_w[i] = src[i];
          }
          // Always step a full chunk so column padding is skipped.
          packed_w += kr;
        }
        packed_w += row_padding;
      }

      row_base += nr;
    }
    k += nc * kc;
  } while (--g != 0);
}
1588
// Packs `g` groups of 16-bit weights, without bias, into `packed_w`.
//
// Each group in `k` holds nc rows of kc elements (row n starts at k[n * kc]).
// Rows are processed in tiles of up to `nr`. Within a tile the columns are
// walked in chunks of `kr`; for every chunk, kr elements are emitted for each
// row of the tile in turn.
//
// Columns below round_down_po2(kc, sr * kr) are read through a shuffled
// offset: the chunk's position inside its sr * kr span is shifted by
// row * kr and masked with (sr - 1) * kr. The remaining columns are copied in
// order.
// NOTE(review): round_down_po2 is defined elsewhere; presumably it rounds its
// first argument down to a multiple of the (power-of-two) second - confirm.
//
// Output slots for missing rows of a partial tile, and for missing columns of
// a partial trailing chunk, are skipped rather than written.
//
// The group loop is a do/while, so its body runs before `g` is tested; g == 0
// is not handled.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t shuffle_span = sr * kr;
  const size_t shuffled_kc = round_down_po2(kc, shuffle_span);
  const size_t shuffle_mask = (sr - 1) * kr;
  do {
    size_t row_base = 0;
    while (row_base < nc) {
      const size_t rows = min(nc - row_base, nr);
      // Output slots belonging to rows absent from a partial tile.
      const size_t row_padding = (nr - rows) * kr;

      // Shuffled part: every chunk here is a full kr elements wide.
      for (size_t col_base = 0; col_base < shuffled_kc; col_base += kr) {
        const size_t span_base = round_down_po2(col_base, shuffle_span);
        for (size_t row = 0; row < rows; row++) {
          const size_t rotated = (col_base + row * kr) & shuffle_mask;
          const uint16_t* src = &k[(row_base + row) * kc + span_base + rotated];
          for (size_t i = 0; i < kr; i++) {
            packed_w[i] = src[i];
          }
          packed_w += kr;
        }
        packed_w += row_padding;
      }

      // Remainder: straight copy; the last chunk may be narrower than kr.
      for (size_t col_base = shuffled_kc; col_base < kc; col_base += kr) {
        const size_t cols = min(kc - col_base, kr);
        for (size_t row = 0; row < rows; row++) {
          const uint16_t* src = &k[(row_base + row) * kc + col_base];
          for (size_t i = 0; i < cols; i++) {
            packed_w[i] = src[i];
          }
          // Always step a full chunk so column padding is skipped.
          packed_w += kr;
        }
        packed_w += row_padding;
      }

      row_base += nr;
    }
    k += nc * kc;
  } while (--g != 0);
}
1632
Marat Dukhana6879bd2020-07-06 14:25:08 -07001633void xnn_pack_f32_dconv_oki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001634 size_t nc,
Marat Dukhanab582382020-07-06 13:32:08 -07001635 size_t kc,
1636 size_t nr,
Marat Dukhana6879bd2020-07-06 14:25:08 -07001637 size_t kh,
1638 size_t kw,
1639 const float* k,
1640 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001641 float* packed_w,
1642 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001643{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001644 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1645 const size_t nr_block_size = min(nc - nr_block_start, nr);
1646 if XNN_LIKELY(b != NULL) {
1647 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1648 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
Marat Dukhanab582382020-07-06 13:32:08 -07001649 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001650 } else {
1651 size_t n = nr;
1652 do {
1653 *packed_w++ = 0.0f;
1654 } while (--n != 0);
1655 }
Marat Dukhanab582382020-07-06 13:32:08 -07001656
Marat Dukhana6879bd2020-07-06 14:25:08 -07001657 for (size_t kx = 0; kx < kw; kx++) {
1658 for (size_t c = 0; c < kc; c++) {
1659 for (size_t ky = 0; ky < kh; ky++) {
1660 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1661 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
Marat Dukhanab582382020-07-06 13:32:08 -07001662 }
Marat Dukhanab582382020-07-06 13:32:08 -07001663 }
1664 }
1665 }
Marat Dukhanab582382020-07-06 13:32:08 -07001666 if XNN_UNPREDICTABLE(b != NULL) {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001667 b += nr;
Marat Dukhanab582382020-07-06 13:32:08 -07001668 }
1669 }
1670}
1671
1672void xnn_pack_f16_dconv_oki_w(
1673 size_t nc,
1674 size_t kc,
1675 size_t nr,
1676 size_t kh,
1677 size_t kw,
1678 const uint16_t* k,
1679 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001680 uint16_t* packed_w,
1681 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001682{
1683 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1684 const size_t nr_block_size = min(nc - nr_block_start, nr);
1685 if XNN_LIKELY(b != NULL) {
1686 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1687 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
1688 }
1689 } else {
1690 size_t n = nr;
1691 do {
1692 *packed_w++ = 0;
1693 } while (--n != 0);
1694 }
1695
1696 for (size_t kx = 0; kx < kw; kx++) {
1697 for (size_t c = 0; c < kc; c++) {
1698 for (size_t ky = 0; ky < kh; ky++) {
1699 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1700 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
1701 }
1702 }
1703 }
1704 }
1705 if XNN_UNPREDICTABLE(b != NULL) {
1706 b += nr;
1707 }
1708 }
1709}
1710
Marat Dukhana6879bd2020-07-06 14:25:08 -07001711void xnn_pack_f32_chw_dwconv_ghw_w(
1712 size_t kernel_size,
1713 size_t groups,
1714 const float* kernel,
1715 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001716 float* packed_weights,
1717 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001718{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001719 for (size_t g = 0; g < groups; g++) {
1720 if XNN_LIKELY(bias != NULL) {
1721 *packed_weights = *bias++;
Marat Dukhanab582382020-07-06 13:32:08 -07001722 } else {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001723 *packed_weights = 0.0f;
Marat Dukhanab582382020-07-06 13:32:08 -07001724 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001725 packed_weights += 1;
1726 for (size_t i = 0; i < kernel_size; i++) {
1727 *packed_weights++ = kernel[g * kernel_size + i];
Marat Dukhanab582382020-07-06 13:32:08 -07001728 }
1729 }
1730}
1731
1732void xnn_pack_f16_chw_dwconv_ghw_w(
1733 size_t kernel_size,
1734 size_t groups,
1735 const uint16_t* kernel,
1736 const uint16_t* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001737 uint16_t* packed_weights,
1738 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001739{
1740 for (size_t g = 0; g < groups; g++) {
1741 if XNN_LIKELY(bias != NULL) {
1742 *packed_weights = *bias++;
1743 } else {
1744 *packed_weights = 0;
1745 }
1746 packed_weights += 1;
1747 for (size_t i = 0; i < kernel_size; i++) {
1748 *packed_weights++ = kernel[g * kernel_size + i];
1749 }
1750 }
1751}
1752
Marat Dukhanab582382020-07-06 13:32:08 -07001753void xnn_pack_f32_chw_dwconv_hwg_w(
1754 size_t kernel_size,
1755 size_t groups,
1756 const float* kernel,
1757 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001758 float* packed_weights,
1759 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001760{
1761 for (size_t g = 0; g < groups; g++) {
1762 if XNN_LIKELY(bias != NULL) {
1763 *packed_weights = *bias++;
1764 } else {
1765 *packed_weights = 0.0f;
1766 }
1767 packed_weights += 1;
1768 for (size_t i = 0; i < kernel_size; i++) {
1769 *packed_weights++ = kernel[i * groups + g];
1770 }
1771 }
1772}
1773
1774void xnn_pack_f32_vmulcaddc_w(
1775 size_t c,
1776 size_t cr,
1777 const float* s,
1778 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001779 float* packed_w,
1780 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001781{
1782 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1783 const size_t cr_block_size = min(c - cr_block_start, cr);
1784 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1785 *packed_w++ = s[cr_block_start + cr_block_offset];
1786 }
1787 packed_w += cr - cr_block_size;
1788 if XNN_LIKELY(b != NULL) {
1789 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1790 *packed_w++ = b[cr_block_start + cr_block_offset];
1791 }
1792 } else {
1793 size_t n = cr_block_size;
1794 do {
1795 *packed_w++ = 0.0f;
1796 } while (--n != 0);
1797 }
1798 packed_w += cr - cr_block_size;
1799 }
1800}
1801
1802void xnn_pack_f16_vmulcaddc_w(
1803 size_t c,
1804 size_t cr,
1805 const uint16_t* s,
1806 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001807 uint16_t* packed_w,
1808 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001809{
1810 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1811 const size_t cr_block_size = min(c - cr_block_start, cr);
1812 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1813 *packed_w++ = s[cr_block_start + cr_block_offset];
1814 }
1815 packed_w += cr - cr_block_size;
1816 if XNN_LIKELY(b != NULL) {
1817 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1818 *packed_w++ = b[cr_block_start + cr_block_offset];
1819 }
1820 } else {
1821 size_t n = cr_block_size;
1822 do {
1823 *packed_w++ = 0;
1824 } while (--n != 0);
1825 }
1826 packed_w += cr - cr_block_size;
1827 }
1828}