blob: 1d8e9d4cdb039046c8ea51153407c708ab0bd826 [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <xnnpack/math.h>
#include <xnnpack/pack.h>
14
15
Marat Dukhana6879bd2020-07-06 14:25:08 -070016void xnn_pack_f32_gemm_goi_w(
17 size_t g,
18 size_t nc,
19 size_t kc,
20 size_t nr,
21 size_t kr,
22 size_t sr,
23 const float* k,
24 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070025 float* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -070026 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -070027 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070028{
29 const size_t skr = sr * kr;
30 const size_t skc = round_down_po2(kc, skr);
31 const size_t sr_mask = (sr - 1) * kr;
32 do {
33 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
34 const size_t nr_block_size = min(nc - nr_block_start, nr);
35 if XNN_LIKELY(b != NULL) {
36 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
37 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
38 }
39 }
40 packed_w += nr;
41
42 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
43 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
44 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
45 *packed_w++ =
46 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
47 }
48 }
49 packed_w += (nr - nr_block_size) * kr;
50 }
51
52 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
53 const size_t kr_block_size = min(kc - kr_block_start, kr);
54 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
55 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
56 *packed_w++ =
57 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
58 }
59 packed_w += kr - kr_block_size;
60 }
61 packed_w += (nr - nr_block_size) * kr;
62 }
Marat Dukhane06c8132021-06-03 08:59:11 -070063 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -070064 }
65 k += nc * kc;
66 if XNN_UNPREDICTABLE(b != NULL) {
67 b += nc;
68 }
69 } while (--g != 0);
70}
71
72void xnn_pack_f16_gemm_goi_w(
73 size_t g,
74 size_t nc,
75 size_t kc,
76 size_t nr,
77 size_t kr,
78 size_t sr,
79 const uint16_t* k,
80 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070081 uint16_t* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -070082 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -070083 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070084{
85 const size_t skr = sr * kr;
86 const size_t skc = round_down_po2(kc, skr);
87 const size_t sr_mask = (sr - 1) * kr;
88 do {
89 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
90 const size_t nr_block_size = min(nc - nr_block_start, nr);
91 if XNN_LIKELY(b != NULL) {
92 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
93 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
94 }
95 }
96 packed_w += nr;
97
98 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
99 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
100 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
101 *packed_w++ =
102 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
103 }
104 }
105 packed_w += (nr - nr_block_size) * kr;
106 }
107
108 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
109 const size_t kr_block_size = min(kc - kr_block_start, kr);
110 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
111 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
112 *packed_w++ =
113 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
114 }
115 packed_w += kr - kr_block_size;
116 }
117 packed_w += (nr - nr_block_size) * kr;
118 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700119 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700120 }
121 k += nc * kc;
122 if XNN_UNPREDICTABLE(b != NULL) {
123 b += nc;
124 }
125 } while (--g != 0);
126}
127
Marat Dukhan08b7a972020-07-14 18:17:29 -0700128void xnn_pack_qu8_gemm_goi_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700129 size_t g,
130 size_t nc,
131 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700132 size_t nr,
133 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700134 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700135 const uint8_t* k,
136 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700137 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700138 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700139 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700140{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700141 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700142 const int32_t izp = (int32_t) params->input_zero_point;
143 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700144 do {
145 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
146 const size_t nr_block_size = min(nc - nr_block_start, nr);
147 int32_t* packed_b = (int32_t*) packed_w;
148 if XNN_LIKELY(b != NULL) {
149 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
150 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
151 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
152 }
153 } else {
154 size_t n = nr_block_size;
155 do {
156 *((int32_t*) packed_w) = boff;
157 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
158 } while (--n != 0);
159 }
160 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
161 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
162 const size_t kr_block_size = min(kc - kr_block_start, kr);
163 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
164 int32_t ksum = 0;
165 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
166 const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
167 ksum += (int32_t) kv;
168 *((uint8_t*) packed_w) = kv;
169 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
170 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700171 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700172 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
173 }
174 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
175 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700176 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -0700177 }
178 k += nc * kc;
179 if XNN_UNPREDICTABLE(b != NULL) {
180 b += nc;
181 }
182 } while (--g != 0);
183}
184
Marat Dukhan595e1702020-07-31 10:12:52 -0700185void xnn_pack_qs8_gemm_goi_w(
186 size_t g,
187 size_t nc,
188 size_t kc,
189 size_t nr,
190 size_t kr,
191 size_t sr,
192 const int8_t* k,
193 const int32_t* b,
194 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700195 size_t extra_bytes,
Marat Dukhan595e1702020-07-31 10:12:52 -0700196 const struct xnn_qs8_packing_params* params)
197{
198 assert(sr == 1);
199 const int32_t izp = (int32_t) params->input_zero_point;
200 do {
201 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
202 const size_t nr_block_size = min(nc - nr_block_start, nr);
203 int32_t* packed_b = (int32_t*) packed_w;
204 if XNN_LIKELY(b != NULL) {
205 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
206 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
207 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
208 }
209 } else {
210 size_t n = nr_block_size;
211 do {
212 *((int32_t*) packed_w) = 0;
213 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
214 } while (--n != 0);
215 }
216 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
217 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
218 const size_t kr_block_size = min(kc - kr_block_start, kr);
219 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
220 int32_t ksum = 0;
221 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
222 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
223 ksum += (int32_t) kv;
224 *((int8_t*) packed_w) = kv;
225 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
226 }
227 packed_b[nr_block_offset] -= ksum * izp;
228 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
229 }
230 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
231 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700232 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan595e1702020-07-31 10:12:52 -0700233 }
234 k += nc * kc;
235 if XNN_UNPREDICTABLE(b != NULL) {
236 b += nc;
237 }
238 } while (--g != 0);
239}
240
Marat Dukhan683fab32020-08-03 19:42:52 -0700241void xnn_pack_qs8_gemm_xw_goi_w(
242 size_t g,
243 size_t nc,
244 size_t kc,
245 size_t nr,
246 size_t kr,
247 size_t sr,
248 const int8_t* k,
249 const int32_t* b,
250 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700251 size_t extra_bytes,
Marat Dukhan683fab32020-08-03 19:42:52 -0700252 const struct xnn_qs8_packing_params* params)
253{
254 assert(sr == 1);
255 const int32_t izp = (int32_t) params->input_zero_point;
256 do {
257 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
258 const size_t nr_block_size = min(nc - nr_block_start, nr);
259 int32_t* packed_b = (int32_t*) packed_w;
260 if XNN_LIKELY(b != NULL) {
261 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
262 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
263 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
264 }
265 } else {
266 size_t n = nr_block_size;
267 do {
268 *((int32_t*) packed_w) = 0;
269 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
270 } while (--n != 0);
271 }
272 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
273 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
274 const size_t kr_block_size = min(kc - kr_block_start, kr);
275 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
276 int32_t ksum = 0;
277 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
278 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
279 ksum += (int32_t) kv;
280 *((int16_t*) packed_w) = (int16_t) kv;
281 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int16_t));
282 }
283 packed_b[nr_block_offset] -= ksum * izp;
284 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int16_t));
285 }
286 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int16_t));
287 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700288 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan683fab32020-08-03 19:42:52 -0700289 }
290 k += nc * kc;
291 if XNN_UNPREDICTABLE(b != NULL) {
292 b += nc;
293 }
294 } while (--g != 0);
295}
296
Marat Dukhana6879bd2020-07-06 14:25:08 -0700297void xnn_pack_f32_gemm_io_w(
298 size_t nc,
299 size_t kc,
300 size_t nr,
301 size_t kr,
302 size_t sr,
303 const float* k,
304 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700305 float* packed_w,
306 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700307{
308 const size_t skr = sr * kr;
309 const size_t skc = round_down_po2(kc, skr);
310 const size_t sr_mask = (sr - 1) * kr;
311 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
312 const size_t nr_block_size = min(nc - nr_block_start, nr);
313 if XNN_LIKELY(b != NULL) {
314 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
315 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
316 }
317 }
318 packed_w += nr;
319
320 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
321 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
322 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
323 *packed_w++ =
324 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
325 }
326 }
327 packed_w += (nr - nr_block_size) * kr;
328 }
329
330 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
331 const size_t kr_block_size = min(kc - kr_block_start, kr);
332 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
333 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
334 *packed_w++ =
335 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
336 }
337 packed_w += kr - kr_block_size;
338 }
339 packed_w += (nr - nr_block_size) * kr;
340 }
341 }
342}
343
344void xnn_pack_f16_gemm_io_w(
345 size_t nc,
346 size_t kc,
347 size_t nr,
348 size_t kr,
349 size_t sr,
350 const uint16_t* k,
351 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700352 uint16_t* packed_w,
353 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700354{
355 const size_t skr = sr * kr;
356 const size_t skc = round_down_po2(kc, skr);
357 const size_t sr_mask = (sr - 1) * kr;
358 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
359 const size_t nr_block_size = min(nc - nr_block_start, nr);
360 if XNN_LIKELY(b != NULL) {
361 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
362 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
363 }
364 }
365 packed_w += nr;
366
367 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
368 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
369 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
370 *packed_w++ =
371 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
372 }
373 }
374 packed_w += (nr - nr_block_size) * kr;
375 }
376
377 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
378 const size_t kr_block_size = min(kc - kr_block_start, kr);
379 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
380 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
381 *packed_w++ =
382 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
383 }
384 packed_w += kr - kr_block_size;
385 }
386 packed_w += (nr - nr_block_size) * kr;
387 }
388 }
389}
390
Marat Dukhan08b7a972020-07-14 18:17:29 -0700391void xnn_pack_qu8_gemm_io_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700392 size_t nc,
393 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700394 size_t nr,
395 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700396 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700397 const uint8_t* k,
398 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700399 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700400 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700401{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700402 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700403 const int32_t izp = (int32_t) params->input_zero_point;
404 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700405 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
406 const size_t nr_block_size = min(nc - nr_block_start, nr);
407 int32_t* packed_b = (int32_t*) packed_w;
408 if XNN_LIKELY(b != NULL) {
409 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
410 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
411 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
412 }
413 } else {
414 size_t n = nr_block_size;
415 do {
416 *((int32_t*) packed_w) = boff;
417 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
418 } while (--n != 0);
419 }
420 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
421 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
422 const size_t kr_block_size = min(kc - kr_block_start, kr);
423 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
424 int32_t ksum = 0;
425 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
426 const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
427 ksum += (int32_t) kv;
428 *((uint8_t*) packed_w) = kv;
429 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
430 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700431 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700432 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
433 }
434 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
435 }
436 }
437}
438
Marat Dukhand23cb6e2021-04-01 01:18:58 -0700439void xnn_pack_qs8_gemm_io_w(
440 size_t nc,
441 size_t kc,
442 size_t nr,
443 size_t kr,
444 size_t sr,
445 const int8_t* k,
446 const int32_t* b,
447 void* packed_w,
448 const struct xnn_qs8_packing_params* params)
449{
450 assert(sr == 1);
451 const int32_t izp = (int32_t) params->input_zero_point;
452 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
453 const size_t nr_block_size = min(nc - nr_block_start, nr);
454 int32_t* packed_b = (int32_t*) packed_w;
455 if XNN_LIKELY(b != NULL) {
456 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
457 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
458 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
459 }
460 } else {
461 size_t n = nr_block_size;
462 do {
463 *((int32_t*) packed_w) = 0;
464 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
465 } while (--n != 0);
466 }
467 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
468 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
469 const size_t kr_block_size = min(kc - kr_block_start, kr);
470 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
471 int32_t ksum = 0;
472 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
473 const int8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
474 ksum += (int32_t) kv;
475 *((int8_t*) packed_w) = kv;
476 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
477 }
478 packed_b[nr_block_offset] -= ksum * izp;
479 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
480 }
481 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
482 }
483 }
484}
485
Marat Dukhana6879bd2020-07-06 14:25:08 -0700486void xnn_pack_f32_conv_goki_w(
487 size_t g,
488 size_t nc,
489 size_t ks,
490 size_t kc,
491 size_t nr,
492 size_t kr,
493 size_t sr,
494 const float* k,
495 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700496 float* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700497 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700498 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700499{
500 const size_t skr = sr * kr;
501 const size_t skc = round_down_po2(kc, skr);
502 const size_t sr_mask = (sr - 1) * kr;
503 do {
504 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
505 const size_t nr_block_size = min(nc - nr_block_start, nr);
506 if XNN_LIKELY(b != NULL) {
507 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
508 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
509 }
510 }
511 packed_w += nr;
512
513 for (size_t ki = 0; ki < ks; ki++) {
514 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
515 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
516 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
517 *packed_w++ =
518 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
519 }
520 }
521 packed_w += (nr - nr_block_size) * kr;
522 }
523
524 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
525 const size_t kr_block_size = min(kc - kr_block_start, kr);
526 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
527 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
528 *packed_w++ =
529 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
530 }
531 packed_w += kr - kr_block_size;
532 }
533 packed_w += (nr - nr_block_size) * kr;
534 }
535 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700536 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700537 }
538 k += ks * kc * nc;
539 if XNN_UNPREDICTABLE(b != NULL) {
540 b += nc;
541 }
542 } while (--g != 0);
543}
544
545void xnn_pack_f16_conv_goki_w(
546 size_t g,
547 size_t nc,
548 size_t ks,
549 size_t kc,
550 size_t nr,
551 size_t kr,
552 size_t sr,
553 const uint16_t* k,
554 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700555 uint16_t* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700556 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700557 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700558{
559 const size_t skr = sr * kr;
560 const size_t skc = round_down_po2(kc, skr);
561 const size_t sr_mask = (sr - 1) * kr;
562 do {
563 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
564 const size_t nr_block_size = min(nc - nr_block_start, nr);
565 if XNN_LIKELY(b != NULL) {
566 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
567 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
568 }
569 }
570 packed_w += nr;
571
572 for (size_t ki = 0; ki < ks; ki++) {
573 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
574 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
575 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
576 *packed_w++ =
577 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
578 }
579 }
580 packed_w += (nr - nr_block_size) * kr;
581 }
582
583 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
584 const size_t kr_block_size = min(kc - kr_block_start, kr);
585 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
586 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
587 *packed_w++ =
588 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
589 }
590 packed_w += kr - kr_block_size;
591 }
592 packed_w += (nr - nr_block_size) * kr;
593 }
594 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700595 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700596 }
597 k += ks * kc * nc;
598 if XNN_UNPREDICTABLE(b != NULL) {
599 b += nc;
600 }
601 } while (--g != 0);
602}
603
Marat Dukhan08b7a972020-07-14 18:17:29 -0700604void xnn_pack_qu8_conv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700605 size_t g,
606 size_t nc,
607 size_t ks,
608 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700609 size_t nr,
610 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700611 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700612 const uint8_t* k,
613 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700614 void* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700615 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700616 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700617{
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700618 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700619 const int32_t izp = (int32_t) params->input_zero_point;
620 const int32_t boff = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700621 do {
622 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
623 const size_t nr_block_size = min(nc - nr_block_start, nr);
624 int32_t* packed_b = (int32_t*) packed_w;
625 if XNN_LIKELY(b != NULL) {
626 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
627 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
628 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
629 }
630 } else {
631 size_t n = nr_block_size;
632 do {
633 *((int32_t*) packed_w) = boff;
634 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
635 } while (--n != 0);
636 }
637 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
638 for (size_t ki = 0; ki < ks; ki++) {
639 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
640 const size_t kr_block_size = min(kc - kr_block_start, kr);
641 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
642 int32_t ksum = 0;
643 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
644 const uint8_t kv =
645 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
646 ksum += (int32_t) kv;
647 *((uint8_t*) packed_w) = kv;
648 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
649 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700650 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700651 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
652 }
653 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
654 }
655 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700656 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -0700657 }
658 k += ks * kc * nc;
659 if XNN_UNPREDICTABLE(b != NULL) {
660 b += nc;
661 }
662 } while (--g != 0);
663}
664
Marat Dukhanf9480682020-07-31 14:50:24 -0700665void xnn_pack_qs8_conv_goki_w(
666 size_t g,
667 size_t nc,
668 size_t ks,
669 size_t kc,
670 size_t nr,
671 size_t kr,
672 size_t sr,
673 const int8_t* k,
674 const int32_t* b,
675 void* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700676 size_t extra_bytes,
Marat Dukhanf9480682020-07-31 14:50:24 -0700677 const struct xnn_qs8_packing_params* params)
678{
679 assert(sr == 1);
680 const int32_t izp = (int32_t) params->input_zero_point;
681 do {
682 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
683 const size_t nr_block_size = min(nc - nr_block_start, nr);
684 int32_t* packed_b = (int32_t*) packed_w;
685 if XNN_LIKELY(b != NULL) {
686 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
687 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
688 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
689 }
690 } else {
691 size_t n = nr_block_size;
692 do {
693 *((int32_t*) packed_w) = 0;
694 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
695 } while (--n != 0);
696 }
697 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
698 for (size_t ki = 0; ki < ks; ki++) {
699 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
700 const size_t kr_block_size = min(kc - kr_block_start, kr);
701 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
702 int32_t ksum = 0;
703 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
704 const int8_t kv =
705 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
706 ksum += (int32_t) kv;
707 *((int8_t*) packed_w) = kv;
708 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
709 }
710 packed_b[nr_block_offset] -= ksum * izp;
711 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
712 }
713 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
714 }
715 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700716 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanf9480682020-07-31 14:50:24 -0700717 }
718 k += ks * kc * nc;
719 if XNN_UNPREDICTABLE(b != NULL) {
720 b += nc;
721 }
722 } while (--g != 0);
723}
724
Marat Dukhana6879bd2020-07-06 14:25:08 -0700725void xnn_pack_f32_conv_kgo_w(
726 size_t g,
727 size_t nc,
728 size_t ks,
729 size_t nr,
730 size_t kr,
731 const float* k,
732 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700733 float* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700734 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700735 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700736{
737 for (size_t i = 0; i < g; i++) {
738 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
739 const size_t nr_block_size = min(nc - nr_block_start, nr);
740 if XNN_LIKELY(b != NULL) {
741 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
742 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
743 }
744 }
745 packed_w += nr;
746 for (size_t ki = 0; ki < ks; ki++) {
747 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
748 *packed_w =
749 k[ki * g * nc + (nr_block_start + nr_block_offset)];
750 packed_w += kr;
751 }
752 packed_w += (nr - nr_block_size) * kr;
753 }
Marat Dukhan97262462021-06-18 16:14:17 -0700754 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700755 }
756 k += nc;
757 if XNN_UNPREDICTABLE(b != NULL) {
758 b += nc;
759 }
760 }
761}
762
763void xnn_pack_f16_conv_kgo_w(
764 size_t g,
765 size_t nc,
766 size_t ks,
767 size_t nr,
768 size_t kr,
769 const uint16_t* k,
770 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700771 uint16_t* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700772 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700773 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700774{
775 for (size_t i = 0; i < g; i++) {
776 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
777 const size_t nr_block_size = min(nc - nr_block_start, nr);
778 if XNN_LIKELY(b != NULL) {
779 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
780 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
781 }
782 }
783 packed_w += nr;
784 for (size_t ki = 0; ki < ks; ki++) {
785 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
786 *packed_w =
787 k[ki * g * nc + (nr_block_start + nr_block_offset)];
788 packed_w += kr;
789 }
790 packed_w += (nr - nr_block_size) * kr;
791 }
Marat Dukhan97262462021-06-18 16:14:17 -0700792 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700793 }
794 k += nc;
795 if XNN_UNPREDICTABLE(b != NULL) {
796 b += nc;
797 }
798 }
799}
800
Marat Dukhan08b7a972020-07-14 18:17:29 -0700801void xnn_pack_qu8_conv_kgo_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700802 size_t g,
803 size_t nc,
804 size_t ks,
Marat Dukhana6879bd2020-07-06 14:25:08 -0700805 size_t nr,
806 size_t kr,
Marat Dukhanab582382020-07-06 13:32:08 -0700807 const uint8_t* k,
808 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700809 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700810 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700811 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700812{
Marat Dukhanb42f8662020-07-06 20:46:13 -0700813 const int32_t izp = (int32_t) params->input_zero_point;
814 const int32_t boff = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700815 for (size_t i = 0; i < g; i++) {
816 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
817 const size_t nr_block_size = min(nc - nr_block_start, nr);
818 int32_t* packed_b = (int32_t*) packed_w;
819 if XNN_LIKELY(b != NULL) {
820 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
821 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
822 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
823 }
824 } else {
825 size_t n = nr_block_size;
826 do {
827 *((int32_t*) packed_w) = boff;
828 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
829 } while (--n != 0);
830 }
831 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
832 for (size_t ki = 0; ki < ks; ki++) {
833 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
834 const uint8_t kv =
835 k[ki * g * nc + (nr_block_start + nr_block_offset)];
836 *((uint8_t*) packed_w) = kv;
Marat Dukhanb42f8662020-07-06 20:46:13 -0700837 packed_b[nr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700838 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
839 }
840 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
841 }
Marat Dukhan97262462021-06-18 16:14:17 -0700842 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -0700843 }
844 k += nc;
845 if XNN_UNPREDICTABLE(b != NULL) {
846 b += nc;
847 }
848 }
849}
850
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700851void xnn_pack_qs8_conv_kgo_w(
852 size_t g,
853 size_t nc,
854 size_t ks,
855 size_t nr,
856 size_t kr,
857 const int8_t* k,
858 const int32_t* b,
859 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700860 size_t extra_bytes,
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700861 const struct xnn_qs8_packing_params* params)
862{
863 const int32_t izp = (int32_t) params->input_zero_point;
864 for (size_t i = 0; i < g; i++) {
865 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
866 const size_t nr_block_size = min(nc - nr_block_start, nr);
867 int32_t* packed_b = (int32_t*) packed_w;
868 if XNN_LIKELY(b != NULL) {
869 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
870 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
871 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
872 }
873 } else {
874 size_t n = nr_block_size;
875 do {
876 *((int32_t*) packed_w) = 0;
877 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
878 } while (--n != 0);
879 }
880 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
881 for (size_t ki = 0; ki < ks; ki++) {
882 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
883 const int8_t kv =
884 k[ki * g * nc + (nr_block_start + nr_block_offset)];
885 *((int8_t*) packed_w) = kv;
886 packed_b[nr_block_offset] -= (int32_t) kv * izp;
887 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(int8_t));
888 }
889 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
890 }
Marat Dukhan97262462021-06-18 16:14:17 -0700891 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700892 }
893 k += nc;
894 if XNN_UNPREDICTABLE(b != NULL) {
895 b += nc;
896 }
897 }
898}
899
Marat Dukhana6879bd2020-07-06 14:25:08 -0700900void xnn_pack_f32_deconv_goki_w(
901 size_t g,
902 size_t nc,
903 size_t kh,
904 size_t kw,
905 size_t kc,
906 size_t sh,
907 size_t sw,
908 size_t nr,
909 size_t kr,
910 size_t sr,
911 const float* k,
912 const float* b,
913 float* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700914 struct subconvolution_params* subconv_params,
915 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700916{
917 const size_t skr = sr * kr;
918 const size_t skc = round_down_po2(kc, skr);
919 const size_t sr_mask = (sr - 1) * kr;
920 for (size_t i = 0; i < g; i++) {
921 for (size_t oy = 0; oy < sh; oy++) {
922 for (size_t ox = 0; ox < sw; ox++) {
923 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700924 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700925 }
926 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
927 const size_t nr_block_size = min(nc - nr_block_start, nr);
928 if XNN_LIKELY(b != NULL) {
929 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
930 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
931 }
932 }
933 packed_w += nr;
934 for (size_t ky = oy; ky < kh; ky += sh) {
935 for (size_t kx = ox; kx < kw; kx += sw) {
936 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
937 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
938 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
939 *packed_w++ =
940 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
941 }
942 }
943 packed_w += (nr - nr_block_size) * kr;
944 }
945
946 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
947 const size_t kr_block_size = min(kc - kr_block_start, kr);
948 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
949 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
950 *packed_w++ =
951 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
952 }
953 packed_w += kr - kr_block_size;
954 }
955 packed_w += (nr - nr_block_size) * kr;
956 }
957 }
958 }
959 }
960 }
961 }
962 k += kh * kw * kc * nc;
963 if XNN_UNPREDICTABLE(b != NULL) {
964 b += nc;
965 }
966 }
967}
968
969void xnn_pack_f16_deconv_goki_w(
970 size_t g,
971 size_t nc,
972 size_t kh,
973 size_t kw,
974 size_t kc,
975 size_t sh,
976 size_t sw,
977 size_t nr,
978 size_t kr,
979 size_t sr,
980 const uint16_t* k,
981 const uint16_t* b,
982 uint16_t* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700983 struct subconvolution_params* subconv_params,
984 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700985{
986 const size_t skr = sr * kr;
987 const size_t skc = round_down_po2(kc, skr);
988 const size_t sr_mask = (sr - 1) * kr;
989 for (size_t i = 0; i < g; i++) {
990 for (size_t oy = 0; oy < sh; oy++) {
991 for (size_t ox = 0; ox < sw; ox++) {
992 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700993 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700994 }
995 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
996 const size_t nr_block_size = min(nc - nr_block_start, nr);
997 if XNN_LIKELY(b != NULL) {
998 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
999 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
1000 }
1001 }
1002 packed_w += nr;
1003 for (size_t ky = oy; ky < kh; ky += sh) {
1004 for (size_t kx = ox; kx < kw; kx += sw) {
1005 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
1006 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1007 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1008 *packed_w++ =
1009 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
1010 }
1011 }
1012 packed_w += (nr - nr_block_size) * kr;
1013 }
1014
1015 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
1016 const size_t kr_block_size = min(kc - kr_block_start, kr);
1017 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1018 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1019 *packed_w++ =
1020 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
1021 }
1022 packed_w += kr - kr_block_size;
1023 }
1024 packed_w += (nr - nr_block_size) * kr;
1025 }
1026 }
1027 }
1028 }
1029 }
1030 }
1031 k += kh * kw * kc * nc;
1032 if XNN_UNPREDICTABLE(b != NULL) {
1033 b += nc;
1034 }
1035 }
1036}
1037
Marat Dukhan08b7a972020-07-14 18:17:29 -07001038void xnn_pack_qu8_deconv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001039 size_t g,
1040 size_t nc,
1041 size_t kh,
1042 size_t kw,
1043 size_t kc,
1044 size_t sh,
1045 size_t sw,
1046 size_t nr,
1047 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -07001048 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -07001049 const uint8_t* k,
1050 const int32_t* b,
1051 void* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001052 struct subconvolution_params* subconv_params,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001053 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001054{
Marat Dukhan5a698bb2020-07-07 20:47:55 -07001055 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -07001056 const int32_t izp = (int32_t) params->input_zero_point;
1057 const int32_t kzp = (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001058 for (size_t i = 0; i < g; i++) {
1059 for (size_t oy = 0; oy < sh; oy++) {
1060 for (size_t ox = 0; ox < sw; ox++) {
1061 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -07001062 (*subconv_params++).weights = packed_w;
Marat Dukhanab582382020-07-06 13:32:08 -07001063 }
Marat Dukhanb42f8662020-07-06 20:46:13 -07001064 const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
Marat Dukhanab582382020-07-06 13:32:08 -07001065 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1066 const size_t nr_block_size = min(nc - nr_block_start, nr);
1067 int32_t* packed_b = (int32_t*) packed_w;
1068 if XNN_LIKELY(b != 0) {
1069 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1070 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
1071 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1072 }
1073 } else {
1074 size_t n = nr_block_size;
1075 do {
1076 *((int32_t*) packed_w) = boff;
1077 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1078 } while (--n != 0);
1079 }
1080 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1081 for (size_t ky = oy; ky < kh; ky += sh) {
1082 for (size_t kx = ox; kx < kw; kx += sw) {
1083 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
1084 const size_t kr_block_size = min(kc - kr_block_start, kr);
1085 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1086 int32_t ksum = 0;
1087 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1088 const uint8_t kv =
1089 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
1090 ksum += (int32_t) kv;
1091 *((uint8_t*) packed_w) = kv;
1092 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1093 }
Marat Dukhanb42f8662020-07-06 20:46:13 -07001094 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001095 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
1096 }
1097 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
1098 }
1099 }
1100 }
1101 }
1102 }
1103 }
1104 k += kh * kw * kc * nc;
1105 if XNN_UNPREDICTABLE(b != NULL) {
1106 b += nc;
1107 }
1108 }
1109}
1110
Marat Dukhana6879bd2020-07-06 14:25:08 -07001111void xnn_pack_f32_dwconv_ghw_w(
1112 size_t h,
1113 size_t w,
1114 size_t c,
1115 size_t cr,
1116 const float* k,
1117 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001118 float* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001119 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001120 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001121{
1122 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1123 const size_t cr_block_size = min(c - cr_block_start, cr);
1124 if XNN_LIKELY(b != NULL) {
1125 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1126 *packed_w++ = b[cr_block_start + cr_block_offset];
1127 }
1128 } else {
1129 size_t n = cr_block_size;
1130 do {
1131 *packed_w++ = 0.0f;
1132 } while (--n != 0);
1133 }
1134 packed_w += cr - cr_block_size;
1135 for (size_t x = 0; x < w; x++) {
1136 for (size_t y = 0; y < h; y++) {
1137 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1138 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1139 *packed_w++ = kv;
1140 }
1141 packed_w += cr - cr_block_size;
1142 }
1143 }
Marat Dukhan82286892021-06-04 17:27:27 -07001144 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001145 }
1146}
1147
1148void xnn_pack_f16_dwconv_ghw_w(
1149 size_t h,
1150 size_t w,
1151 size_t c,
1152 size_t cr,
1153 const uint16_t* k,
1154 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001155 uint16_t* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001156 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001157 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001158{
1159 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1160 const size_t cr_block_size = min(c - cr_block_start, cr);
1161 if XNN_LIKELY(b != NULL) {
1162 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1163 *packed_w++ = b[cr_block_start + cr_block_offset];
1164 }
1165 } else {
1166 size_t n = cr_block_size;
1167 do {
1168 *packed_w++ = 0;
1169 } while (--n != 0);
1170 }
1171 packed_w += cr - cr_block_size;
1172 for (size_t x = 0; x < w; x++) {
1173 for (size_t y = 0; y < h; y++) {
1174 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1175 const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1176 *packed_w++ = kv;
1177 }
1178 packed_w += cr - cr_block_size;
1179 }
1180 }
Marat Dukhan82286892021-06-04 17:27:27 -07001181 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001182 }
1183}
1184
Marat Dukhan08b7a972020-07-14 18:17:29 -07001185void xnn_pack_qu8_dwconv_ghw_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001186 size_t h,
1187 size_t w,
1188 size_t c,
1189 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001190 const uint8_t* k,
1191 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001192 void* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001193 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001194 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001195{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001196 const int32_t izp = (int32_t) params->input_zero_point;
1197 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001198 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1199 const size_t cr_block_size = min(c - cr_block_start, cr);
1200 int32_t* packed_b = (int32_t*) packed_w;
1201 if XNN_LIKELY(b != NULL) {
1202 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1203 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1204 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1205 }
1206 } else {
1207 size_t n = cr_block_size;
1208 do {
1209 *((int32_t*) packed_w) = boff;
1210 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1211 } while (--n != 0);
1212 }
1213 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1214 for (size_t x = 0; x < w; x++) {
1215 for (size_t y = 0; y < h; y++) {
1216 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1217 const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001218 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001219 *((uint8_t*) packed_w) = kv;
1220 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1221 }
1222 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1223 }
1224 }
Marat Dukhan82286892021-06-04 17:27:27 -07001225 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -07001226 }
1227}
1228
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001229void xnn_pack_qs8_dwconv_ghw_w(
1230 size_t h,
1231 size_t w,
1232 size_t c,
1233 size_t cr,
1234 const int8_t* k,
1235 const int32_t* b,
1236 void* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001237 size_t extra_bytes,
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001238 const struct xnn_qs8_packing_params* params)
1239{
1240 const int32_t izp = (int32_t) params->input_zero_point;
1241 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1242 const size_t cr_block_size = min(c - cr_block_start, cr);
1243 int32_t* packed_b = (int32_t*) packed_w;
1244 if XNN_LIKELY(b != NULL) {
1245 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1246 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1247 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1248 }
1249 } else {
1250 size_t n = cr_block_size;
1251 do {
1252 *((int32_t*) packed_w) = 0;
1253 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1254 } while (--n != 0);
1255 }
1256 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1257 for (size_t x = 0; x < w; x++) {
1258 for (size_t y = 0; y < h; y++) {
1259 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1260 const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1261 packed_b[cr_block_offset] -= (int32_t) kv * izp;
1262 *((int8_t*) packed_w) = kv;
1263 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1264 }
1265 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1266 }
1267 }
Marat Dukhan82286892021-06-04 17:27:27 -07001268 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001269 }
1270}
1271
Marat Dukhana6879bd2020-07-06 14:25:08 -07001272void xnn_pack_f32_dwconv_hwg_w(
1273 size_t h,
1274 size_t w,
1275 size_t c,
1276 size_t cr,
1277 const float* k,
1278 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001279 float* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001280 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001281 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001282{
1283 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1284 const size_t cr_block_size = min(c - cr_block_start, cr);
1285 if XNN_LIKELY(b != NULL) {
1286 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1287 *packed_w++ = b[cr_block_start + cr_block_offset];
1288 }
1289 } else {
1290 size_t n = cr_block_size;
1291 do {
1292 *packed_w++ = 0.0f;
1293 } while (--n != 0);
1294 }
1295 packed_w += cr - cr_block_size;
1296 for (size_t x = 0; x < w; x++) {
1297 for (size_t y = 0; y < h; y++) {
1298 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1299 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1300 *packed_w++ = kv;
1301 }
1302 packed_w += cr - cr_block_size;
1303 }
1304 }
Marat Dukhan97262462021-06-18 16:14:17 -07001305 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001306 }
1307}
1308
1309void xnn_pack_f16_dwconv_hwg_w(
1310 size_t h,
1311 size_t w,
1312 size_t c,
1313 size_t cr,
1314 const uint16_t* k,
1315 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001316 uint16_t* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001317 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001318 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001319{
1320 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1321 const size_t cr_block_size = min(c - cr_block_start, cr);
1322 if XNN_LIKELY(b != NULL) {
1323 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1324 *packed_w++ = b[cr_block_start + cr_block_offset];
1325 }
1326 } else {
1327 size_t n = cr_block_size;
1328 do {
1329 *packed_w++ = 0;
1330 } while (--n != 0);
1331 }
1332 packed_w += cr - cr_block_size;
1333 for (size_t x = 0; x < w; x++) {
1334 for (size_t y = 0; y < h; y++) {
1335 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1336 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1337 *packed_w++ = kv;
1338 }
1339 packed_w += cr - cr_block_size;
1340 }
1341 }
Marat Dukhan97262462021-06-18 16:14:17 -07001342 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001343 }
1344}
1345
Marat Dukhan08b7a972020-07-14 18:17:29 -07001346void xnn_pack_qu8_dwconv_hwg_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001347 size_t h,
1348 size_t w,
1349 size_t c,
1350 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001351 const uint8_t* k,
1352 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001353 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001354 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001355 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001356{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001357 const int32_t izp = (int32_t) params->input_zero_point;
1358 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001359 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1360 const size_t cr_block_size = min(c - cr_block_start, cr);
1361 int32_t* packed_b = (int32_t*) packed_w;
1362 if XNN_LIKELY(b != NULL) {
1363 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1364 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1365 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1366 }
1367 } else {
1368 size_t n = cr_block_size;
1369 do {
1370 *((int32_t*) packed_w) = boff;
1371 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1372 } while (--n != 0);
1373 }
1374 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1375 for (size_t x = 0; x < w; x++) {
1376 for (size_t y = 0; y < h; y++) {
1377 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1378 const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001379 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001380 *((uint8_t*) packed_w) = kv;
1381 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1382 }
1383 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1384 }
1385 }
Marat Dukhan97262462021-06-18 16:14:17 -07001386 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -07001387 }
1388}
1389
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001390void xnn_pack_qs8_dwconv_hwg_w(
1391 size_t h,
1392 size_t w,
1393 size_t c,
1394 size_t cr,
1395 const int8_t* k,
1396 const int32_t* b,
1397 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001398 size_t extra_bytes,
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001399 const struct xnn_qs8_packing_params* params)
1400{
1401 const int32_t izp = (int32_t) params->input_zero_point;
1402 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1403 const size_t cr_block_size = min(c - cr_block_start, cr);
1404 int32_t* packed_b = (int32_t*) packed_w;
1405 if XNN_LIKELY(b != NULL) {
1406 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1407 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1408 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1409 }
1410 } else {
1411 size_t n = cr_block_size;
1412 do {
1413 *((int32_t*) packed_w) = 0;
1414 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1415 } while (--n != 0);
1416 }
1417 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1418 for (size_t x = 0; x < w; x++) {
1419 for (size_t y = 0; y < h; y++) {
1420 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1421 const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1422 packed_b[cr_block_offset] -= (int32_t) kv * izp;
1423 *((int8_t*) packed_w) = kv;
1424 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1425 }
1426 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1427 }
1428 }
Marat Dukhan97262462021-06-18 16:14:17 -07001429 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001430 }
1431}
1432
// Packs f32 GEMM weights, without a bias section, into tiles of nr output
// channels.
//
// The source matrix is indexed as k[channel * kc + input_channel] and advances
// by nc * kc elements per group. The group loop is a do/while that decrements
// g, so its body executes at least once.
//
// For each tile of up to nr output channels and each kr-sized chunk of the kc
// input channels, nr slots of kr elements are emitted. Padding channels and
// the tail of a partial chunk are skipped without being written.
//
// For the first skc = round_down_po2(kc, sr * kr) input channels, the chunk
// read for channel offset n is shifted within its sr * kr span by
// ((k_start + n * kr) & sr_mask). The remaining input channels are copied in
// order.
//
// params is not read by this function.
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t n_start = 0; n_start < nc; n_start += nr) {
      const size_t n_count = min(nc - n_start, nr);
      const size_t n_padding = (nr - n_count) * kr;

      // Full sr * kr spans: chunks are shifted per output channel.
      for (size_t k_start = 0; k_start < skc; k_start += kr) {
        const size_t k_span = round_down_po2(k_start, skr);
        for (size_t n = 0; n < n_count; n++) {
          const size_t k_index =
            (n_start + n) * kc + k_span + ((k_start + n * kr) & sr_mask);
          for (size_t j = 0; j < kr; j++) {
            packed_w[j] = k[k_index + j];
          }
          packed_w += kr;
        }
        packed_w += n_padding;
      }

      // Remaining input channels: copied in order, last chunk may be partial.
      for (size_t k_start = skc; k_start < kc; k_start += kr) {
        const size_t k_count = min(kc - k_start, kr);
        for (size_t n = 0; n < n_count; n++) {
          const size_t k_index = (n_start + n) * kc + k_start;
          for (size_t j = 0; j < k_count; j++) {
            packed_w[j] = k[k_index + j];
          }
          packed_w += kr;
        }
        packed_w += n_padding;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}
1476
// Packs 16-bit (uint16_t-typed) GEMM weights, without a bias section, into
// tiles of nr output channels.
//
// The source matrix is indexed as k[channel * kc + input_channel] and advances
// by nc * kc elements per group. The group loop is a do/while that decrements
// g, so its body executes at least once.
//
// For each tile of up to nr output channels and each kr-sized chunk of the kc
// input channels, nr slots of kr elements are emitted. Padding channels and
// the tail of a partial chunk are skipped without being written.
//
// For the first skc = round_down_po2(kc, sr * kr) input channels, the chunk
// read for channel offset n is shifted within its sr * kr span by
// ((k_start + n * kr) & sr_mask). The remaining input channels are copied in
// order.
//
// Values are copied bit-for-bit; params is not read by this function.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t n_start = 0; n_start < nc; n_start += nr) {
      const size_t n_count = min(nc - n_start, nr);
      const size_t n_padding = (nr - n_count) * kr;

      // Full sr * kr spans: chunks are shifted per output channel.
      for (size_t k_start = 0; k_start < skc; k_start += kr) {
        const size_t k_span = round_down_po2(k_start, skr);
        for (size_t n = 0; n < n_count; n++) {
          const size_t k_index =
            (n_start + n) * kc + k_span + ((k_start + n * kr) & sr_mask);
          for (size_t j = 0; j < kr; j++) {
            packed_w[j] = k[k_index + j];
          }
          packed_w += kr;
        }
        packed_w += n_padding;
      }

      // Remaining input channels: copied in order, last chunk may be partial.
      for (size_t k_start = skc; k_start < kc; k_start += kr) {
        const size_t k_count = min(kc - k_start, kr);
        for (size_t n = 0; n < n_count; n++) {
          const size_t k_index = (n_start + n) * kc + k_start;
          for (size_t j = 0; j < k_count; j++) {
            packed_w[j] = k[k_index + j];
          }
          packed_w += kr;
        }
        packed_w += n_padding;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}
1520
Marat Dukhana6879bd2020-07-06 14:25:08 -07001521void xnn_pack_f32_dconv_oki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001522 size_t nc,
Marat Dukhanab582382020-07-06 13:32:08 -07001523 size_t kc,
1524 size_t nr,
Marat Dukhana6879bd2020-07-06 14:25:08 -07001525 size_t kh,
1526 size_t kw,
1527 const float* k,
1528 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001529 float* packed_w,
1530 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001531{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001532 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1533 const size_t nr_block_size = min(nc - nr_block_start, nr);
1534 if XNN_LIKELY(b != NULL) {
1535 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1536 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
Marat Dukhanab582382020-07-06 13:32:08 -07001537 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001538 } else {
1539 size_t n = nr;
1540 do {
1541 *packed_w++ = 0.0f;
1542 } while (--n != 0);
1543 }
Marat Dukhanab582382020-07-06 13:32:08 -07001544
Marat Dukhana6879bd2020-07-06 14:25:08 -07001545 for (size_t kx = 0; kx < kw; kx++) {
1546 for (size_t c = 0; c < kc; c++) {
1547 for (size_t ky = 0; ky < kh; ky++) {
1548 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1549 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
Marat Dukhanab582382020-07-06 13:32:08 -07001550 }
Marat Dukhanab582382020-07-06 13:32:08 -07001551 }
1552 }
1553 }
Marat Dukhanab582382020-07-06 13:32:08 -07001554 if XNN_UNPREDICTABLE(b != NULL) {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001555 b += nr;
Marat Dukhanab582382020-07-06 13:32:08 -07001556 }
1557 }
1558}
1559
1560void xnn_pack_f16_dconv_oki_w(
1561 size_t nc,
1562 size_t kc,
1563 size_t nr,
1564 size_t kh,
1565 size_t kw,
1566 const uint16_t* k,
1567 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001568 uint16_t* packed_w,
1569 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001570{
1571 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1572 const size_t nr_block_size = min(nc - nr_block_start, nr);
1573 if XNN_LIKELY(b != NULL) {
1574 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1575 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
1576 }
1577 } else {
1578 size_t n = nr;
1579 do {
1580 *packed_w++ = 0;
1581 } while (--n != 0);
1582 }
1583
1584 for (size_t kx = 0; kx < kw; kx++) {
1585 for (size_t c = 0; c < kc; c++) {
1586 for (size_t ky = 0; ky < kh; ky++) {
1587 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1588 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
1589 }
1590 }
1591 }
1592 }
1593 if XNN_UNPREDICTABLE(b != NULL) {
1594 b += nr;
1595 }
1596 }
1597}
1598
Marat Dukhana6879bd2020-07-06 14:25:08 -07001599void xnn_pack_f32_chw_dwconv_ghw_w(
1600 size_t kernel_size,
1601 size_t groups,
1602 const float* kernel,
1603 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001604 float* packed_weights,
1605 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001606{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001607 for (size_t g = 0; g < groups; g++) {
1608 if XNN_LIKELY(bias != NULL) {
1609 *packed_weights = *bias++;
Marat Dukhanab582382020-07-06 13:32:08 -07001610 } else {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001611 *packed_weights = 0.0f;
Marat Dukhanab582382020-07-06 13:32:08 -07001612 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001613 packed_weights += 1;
1614 for (size_t i = 0; i < kernel_size; i++) {
1615 *packed_weights++ = kernel[g * kernel_size + i];
Marat Dukhanab582382020-07-06 13:32:08 -07001616 }
1617 }
1618}
1619
1620void xnn_pack_f16_chw_dwconv_ghw_w(
1621 size_t kernel_size,
1622 size_t groups,
1623 const uint16_t* kernel,
1624 const uint16_t* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001625 uint16_t* packed_weights,
1626 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001627{
1628 for (size_t g = 0; g < groups; g++) {
1629 if XNN_LIKELY(bias != NULL) {
1630 *packed_weights = *bias++;
1631 } else {
1632 *packed_weights = 0;
1633 }
1634 packed_weights += 1;
1635 for (size_t i = 0; i < kernel_size; i++) {
1636 *packed_weights++ = kernel[g * kernel_size + i];
1637 }
1638 }
1639}
1640
Marat Dukhanab582382020-07-06 13:32:08 -07001641void xnn_pack_f32_chw_dwconv_hwg_w(
1642 size_t kernel_size,
1643 size_t groups,
1644 const float* kernel,
1645 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001646 float* packed_weights,
1647 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001648{
1649 for (size_t g = 0; g < groups; g++) {
1650 if XNN_LIKELY(bias != NULL) {
1651 *packed_weights = *bias++;
1652 } else {
1653 *packed_weights = 0.0f;
1654 }
1655 packed_weights += 1;
1656 for (size_t i = 0; i < kernel_size; i++) {
1657 *packed_weights++ = kernel[i * groups + g];
1658 }
1659 }
1660}
1661
1662void xnn_pack_f32_vmulcaddc_w(
1663 size_t c,
1664 size_t cr,
1665 const float* s,
1666 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001667 float* packed_w,
1668 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001669{
1670 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1671 const size_t cr_block_size = min(c - cr_block_start, cr);
1672 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1673 *packed_w++ = s[cr_block_start + cr_block_offset];
1674 }
1675 packed_w += cr - cr_block_size;
1676 if XNN_LIKELY(b != NULL) {
1677 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1678 *packed_w++ = b[cr_block_start + cr_block_offset];
1679 }
1680 } else {
1681 size_t n = cr_block_size;
1682 do {
1683 *packed_w++ = 0.0f;
1684 } while (--n != 0);
1685 }
1686 packed_w += cr - cr_block_size;
1687 }
1688}
1689
1690void xnn_pack_f16_vmulcaddc_w(
1691 size_t c,
1692 size_t cr,
1693 const uint16_t* s,
1694 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001695 uint16_t* packed_w,
1696 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001697{
1698 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1699 const size_t cr_block_size = min(c - cr_block_start, cr);
1700 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1701 *packed_w++ = s[cr_block_start + cr_block_offset];
1702 }
1703 packed_w += cr - cr_block_size;
1704 if XNN_LIKELY(b != NULL) {
1705 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1706 *packed_w++ = b[cr_block_start + cr_block_offset];
1707 }
1708 } else {
1709 size_t n = cr_block_size;
1710 do {
1711 *packed_w++ = 0;
1712 } while (--n != 0);
1713 }
1714 packed_w += cr - cr_block_size;
1715 }
1716}