blob: 3a61d4d062ff4a5291b32e622904839316344068 [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8
9#include <stdint.h>
10#include <stddef.h>
11
12#include <xnnpack/math.h>
13#include <xnnpack/pack.h>
14
15
Marat Dukhana6879bd2020-07-06 14:25:08 -070016void xnn_pack_f32_gemm_goi_w(
17 size_t g,
18 size_t nc,
19 size_t kc,
20 size_t nr,
21 size_t kr,
22 size_t sr,
23 const float* k,
24 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070025 float* packed_w,
26 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070027{
28 const size_t skr = sr * kr;
29 const size_t skc = round_down_po2(kc, skr);
30 const size_t sr_mask = (sr - 1) * kr;
31 do {
32 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
33 const size_t nr_block_size = min(nc - nr_block_start, nr);
34 if XNN_LIKELY(b != NULL) {
35 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
36 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
37 }
38 }
39 packed_w += nr;
40
41 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
42 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
43 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
44 *packed_w++ =
45 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
46 }
47 }
48 packed_w += (nr - nr_block_size) * kr;
49 }
50
51 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
52 const size_t kr_block_size = min(kc - kr_block_start, kr);
53 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
54 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
55 *packed_w++ =
56 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
57 }
58 packed_w += kr - kr_block_size;
59 }
60 packed_w += (nr - nr_block_size) * kr;
61 }
62 }
63 k += nc * kc;
64 if XNN_UNPREDICTABLE(b != NULL) {
65 b += nc;
66 }
67 } while (--g != 0);
68}
69
70void xnn_pack_f16_gemm_goi_w(
71 size_t g,
72 size_t nc,
73 size_t kc,
74 size_t nr,
75 size_t kr,
76 size_t sr,
77 const uint16_t* k,
78 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070079 uint16_t* packed_w,
80 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070081{
82 const size_t skr = sr * kr;
83 const size_t skc = round_down_po2(kc, skr);
84 const size_t sr_mask = (sr - 1) * kr;
85 do {
86 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
87 const size_t nr_block_size = min(nc - nr_block_start, nr);
88 if XNN_LIKELY(b != NULL) {
89 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
90 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
91 }
92 }
93 packed_w += nr;
94
95 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
96 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
97 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
98 *packed_w++ =
99 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
100 }
101 }
102 packed_w += (nr - nr_block_size) * kr;
103 }
104
105 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
106 const size_t kr_block_size = min(kc - kr_block_start, kr);
107 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
108 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
109 *packed_w++ =
110 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
111 }
112 packed_w += kr - kr_block_size;
113 }
114 packed_w += (nr - nr_block_size) * kr;
115 }
116 }
117 k += nc * kc;
118 if XNN_UNPREDICTABLE(b != NULL) {
119 b += nc;
120 }
121 } while (--g != 0);
122}
123
Marat Dukhanab582382020-07-06 13:32:08 -0700124void xnn_pack_q8_gemm_goi_w(
125 size_t g,
126 size_t nc,
127 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700128 size_t nr,
129 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700130 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700131 const uint8_t* k,
132 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700133 void* packed_w,
134 const struct xnn_q8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700135{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700136 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700137 const int32_t izp = (int32_t) params->input_zero_point;
138 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700139 do {
140 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
141 const size_t nr_block_size = min(nc - nr_block_start, nr);
142 int32_t* packed_b = (int32_t*) packed_w;
143 if XNN_LIKELY(b != NULL) {
144 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
145 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
146 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
147 }
148 } else {
149 size_t n = nr_block_size;
150 do {
151 *((int32_t*) packed_w) = boff;
152 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
153 } while (--n != 0);
154 }
155 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
156 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
157 const size_t kr_block_size = min(kc - kr_block_start, kr);
158 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
159 int32_t ksum = 0;
160 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
161 const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
162 ksum += (int32_t) kv;
163 *((uint8_t*) packed_w) = kv;
164 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
165 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700166 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700167 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
168 }
169 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
170 }
171 }
172 k += nc * kc;
173 if XNN_UNPREDICTABLE(b != NULL) {
174 b += nc;
175 }
176 } while (--g != 0);
177}
178
Marat Dukhana6879bd2020-07-06 14:25:08 -0700179void xnn_pack_f32_gemm_io_w(
180 size_t nc,
181 size_t kc,
182 size_t nr,
183 size_t kr,
184 size_t sr,
185 const float* k,
186 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700187 float* packed_w,
188 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700189{
190 const size_t skr = sr * kr;
191 const size_t skc = round_down_po2(kc, skr);
192 const size_t sr_mask = (sr - 1) * kr;
193 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
194 const size_t nr_block_size = min(nc - nr_block_start, nr);
195 if XNN_LIKELY(b != NULL) {
196 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
197 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
198 }
199 }
200 packed_w += nr;
201
202 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
203 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
204 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
205 *packed_w++ =
206 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
207 }
208 }
209 packed_w += (nr - nr_block_size) * kr;
210 }
211
212 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
213 const size_t kr_block_size = min(kc - kr_block_start, kr);
214 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
215 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
216 *packed_w++ =
217 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
218 }
219 packed_w += kr - kr_block_size;
220 }
221 packed_w += (nr - nr_block_size) * kr;
222 }
223 }
224}
225
226void xnn_pack_f16_gemm_io_w(
227 size_t nc,
228 size_t kc,
229 size_t nr,
230 size_t kr,
231 size_t sr,
232 const uint16_t* k,
233 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700234 uint16_t* packed_w,
235 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700236{
237 const size_t skr = sr * kr;
238 const size_t skc = round_down_po2(kc, skr);
239 const size_t sr_mask = (sr - 1) * kr;
240 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
241 const size_t nr_block_size = min(nc - nr_block_start, nr);
242 if XNN_LIKELY(b != NULL) {
243 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
244 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
245 }
246 }
247 packed_w += nr;
248
249 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
250 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
251 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
252 *packed_w++ =
253 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
254 }
255 }
256 packed_w += (nr - nr_block_size) * kr;
257 }
258
259 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
260 const size_t kr_block_size = min(kc - kr_block_start, kr);
261 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
262 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
263 *packed_w++ =
264 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
265 }
266 packed_w += kr - kr_block_size;
267 }
268 packed_w += (nr - nr_block_size) * kr;
269 }
270 }
271}
272
Marat Dukhanab582382020-07-06 13:32:08 -0700273void xnn_pack_q8_gemm_io_w(
274 size_t nc,
275 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700276 size_t nr,
277 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700278 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700279 const uint8_t* k,
280 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700281 void* packed_w,
282 const struct xnn_q8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700283{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700284 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700285 const int32_t izp = (int32_t) params->input_zero_point;
286 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700287 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
288 const size_t nr_block_size = min(nc - nr_block_start, nr);
289 int32_t* packed_b = (int32_t*) packed_w;
290 if XNN_LIKELY(b != NULL) {
291 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
292 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
293 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
294 }
295 } else {
296 size_t n = nr_block_size;
297 do {
298 *((int32_t*) packed_w) = boff;
299 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
300 } while (--n != 0);
301 }
302 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
303 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
304 const size_t kr_block_size = min(kc - kr_block_start, kr);
305 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
306 int32_t ksum = 0;
307 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
308 const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
309 ksum += (int32_t) kv;
310 *((uint8_t*) packed_w) = kv;
311 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
312 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700313 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700314 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
315 }
316 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
317 }
318 }
319}
320
Marat Dukhana6879bd2020-07-06 14:25:08 -0700321void xnn_pack_f32_conv_goki_w(
322 size_t g,
323 size_t nc,
324 size_t ks,
325 size_t kc,
326 size_t nr,
327 size_t kr,
328 size_t sr,
329 const float* k,
330 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700331 float* packed_w,
332 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700333{
334 const size_t skr = sr * kr;
335 const size_t skc = round_down_po2(kc, skr);
336 const size_t sr_mask = (sr - 1) * kr;
337 do {
338 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
339 const size_t nr_block_size = min(nc - nr_block_start, nr);
340 if XNN_LIKELY(b != NULL) {
341 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
342 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
343 }
344 }
345 packed_w += nr;
346
347 for (size_t ki = 0; ki < ks; ki++) {
348 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
349 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
350 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
351 *packed_w++ =
352 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
353 }
354 }
355 packed_w += (nr - nr_block_size) * kr;
356 }
357
358 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
359 const size_t kr_block_size = min(kc - kr_block_start, kr);
360 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
361 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
362 *packed_w++ =
363 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
364 }
365 packed_w += kr - kr_block_size;
366 }
367 packed_w += (nr - nr_block_size) * kr;
368 }
369 }
370 }
371 k += ks * kc * nc;
372 if XNN_UNPREDICTABLE(b != NULL) {
373 b += nc;
374 }
375 } while (--g != 0);
376}
377
378void xnn_pack_f16_conv_goki_w(
379 size_t g,
380 size_t nc,
381 size_t ks,
382 size_t kc,
383 size_t nr,
384 size_t kr,
385 size_t sr,
386 const uint16_t* k,
387 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700388 uint16_t* packed_w,
389 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700390{
391 const size_t skr = sr * kr;
392 const size_t skc = round_down_po2(kc, skr);
393 const size_t sr_mask = (sr - 1) * kr;
394 do {
395 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
396 const size_t nr_block_size = min(nc - nr_block_start, nr);
397 if XNN_LIKELY(b != NULL) {
398 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
399 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
400 }
401 }
402 packed_w += nr;
403
404 for (size_t ki = 0; ki < ks; ki++) {
405 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
406 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
407 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
408 *packed_w++ =
409 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
410 }
411 }
412 packed_w += (nr - nr_block_size) * kr;
413 }
414
415 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
416 const size_t kr_block_size = min(kc - kr_block_start, kr);
417 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
418 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
419 *packed_w++ =
420 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
421 }
422 packed_w += kr - kr_block_size;
423 }
424 packed_w += (nr - nr_block_size) * kr;
425 }
426 }
427 }
428 k += ks * kc * nc;
429 if XNN_UNPREDICTABLE(b != NULL) {
430 b += nc;
431 }
432 } while (--g != 0);
433}
434
Marat Dukhanab582382020-07-06 13:32:08 -0700435void xnn_pack_q8_conv_goki_w(
436 size_t g,
437 size_t nc,
438 size_t ks,
439 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700440 size_t nr,
441 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700442 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700443 const uint8_t* k,
444 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700445 void* packed_w,
446 const struct xnn_q8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700447{
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700448 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700449 const int32_t izp = (int32_t) params->input_zero_point;
450 const int32_t boff = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700451 do {
452 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
453 const size_t nr_block_size = min(nc - nr_block_start, nr);
454 int32_t* packed_b = (int32_t*) packed_w;
455 if XNN_LIKELY(b != NULL) {
456 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
457 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
458 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
459 }
460 } else {
461 size_t n = nr_block_size;
462 do {
463 *((int32_t*) packed_w) = boff;
464 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
465 } while (--n != 0);
466 }
467 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
468 for (size_t ki = 0; ki < ks; ki++) {
469 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
470 const size_t kr_block_size = min(kc - kr_block_start, kr);
471 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
472 int32_t ksum = 0;
473 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
474 const uint8_t kv =
475 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
476 ksum += (int32_t) kv;
477 *((uint8_t*) packed_w) = kv;
478 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
479 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700480 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700481 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
482 }
483 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
484 }
485 }
486 }
487 k += ks * kc * nc;
488 if XNN_UNPREDICTABLE(b != NULL) {
489 b += nc;
490 }
491 } while (--g != 0);
492}
493
Marat Dukhana6879bd2020-07-06 14:25:08 -0700494void xnn_pack_f32_conv_kgo_w(
495 size_t g,
496 size_t nc,
497 size_t ks,
498 size_t nr,
499 size_t kr,
500 const float* k,
501 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700502 float* packed_w,
503 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700504{
505 for (size_t i = 0; i < g; i++) {
506 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
507 const size_t nr_block_size = min(nc - nr_block_start, nr);
508 if XNN_LIKELY(b != NULL) {
509 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
510 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
511 }
512 }
513 packed_w += nr;
514 for (size_t ki = 0; ki < ks; ki++) {
515 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
516 *packed_w =
517 k[ki * g * nc + (nr_block_start + nr_block_offset)];
518 packed_w += kr;
519 }
520 packed_w += (nr - nr_block_size) * kr;
521 }
522 }
523 k += nc;
524 if XNN_UNPREDICTABLE(b != NULL) {
525 b += nc;
526 }
527 }
528}
529
530void xnn_pack_f16_conv_kgo_w(
531 size_t g,
532 size_t nc,
533 size_t ks,
534 size_t nr,
535 size_t kr,
536 const uint16_t* k,
537 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700538 uint16_t* packed_w,
539 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700540{
541 for (size_t i = 0; i < g; i++) {
542 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
543 const size_t nr_block_size = min(nc - nr_block_start, nr);
544 if XNN_LIKELY(b != NULL) {
545 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
546 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
547 }
548 }
549 packed_w += nr;
550 for (size_t ki = 0; ki < ks; ki++) {
551 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
552 *packed_w =
553 k[ki * g * nc + (nr_block_start + nr_block_offset)];
554 packed_w += kr;
555 }
556 packed_w += (nr - nr_block_size) * kr;
557 }
558 }
559 k += nc;
560 if XNN_UNPREDICTABLE(b != NULL) {
561 b += nc;
562 }
563 }
564}
565
Marat Dukhanab582382020-07-06 13:32:08 -0700566void xnn_pack_q8_conv_kgo_w(
567 size_t g,
568 size_t nc,
569 size_t ks,
Marat Dukhana6879bd2020-07-06 14:25:08 -0700570 size_t nr,
571 size_t kr,
Marat Dukhanab582382020-07-06 13:32:08 -0700572 const uint8_t* k,
573 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700574 void* packed_w,
575 const struct xnn_q8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700576{
Marat Dukhanb42f8662020-07-06 20:46:13 -0700577 const int32_t izp = (int32_t) params->input_zero_point;
578 const int32_t boff = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700579 for (size_t i = 0; i < g; i++) {
580 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
581 const size_t nr_block_size = min(nc - nr_block_start, nr);
582 int32_t* packed_b = (int32_t*) packed_w;
583 if XNN_LIKELY(b != NULL) {
584 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
585 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
586 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
587 }
588 } else {
589 size_t n = nr_block_size;
590 do {
591 *((int32_t*) packed_w) = boff;
592 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
593 } while (--n != 0);
594 }
595 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
596 for (size_t ki = 0; ki < ks; ki++) {
597 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
598 const uint8_t kv =
599 k[ki * g * nc + (nr_block_start + nr_block_offset)];
600 *((uint8_t*) packed_w) = kv;
Marat Dukhanb42f8662020-07-06 20:46:13 -0700601 packed_b[nr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700602 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
603 }
604 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
605 }
606 }
607 k += nc;
608 if XNN_UNPREDICTABLE(b != NULL) {
609 b += nc;
610 }
611 }
612}
613
Marat Dukhana6879bd2020-07-06 14:25:08 -0700614void xnn_pack_f32_deconv_goki_w(
615 size_t g,
616 size_t nc,
617 size_t kh,
618 size_t kw,
619 size_t kc,
620 size_t sh,
621 size_t sw,
622 size_t nr,
623 size_t kr,
624 size_t sr,
625 const float* k,
626 const float* b,
627 float* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700628 struct subconvolution_params* subconv_params,
629 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700630{
631 const size_t skr = sr * kr;
632 const size_t skc = round_down_po2(kc, skr);
633 const size_t sr_mask = (sr - 1) * kr;
634 for (size_t i = 0; i < g; i++) {
635 for (size_t oy = 0; oy < sh; oy++) {
636 for (size_t ox = 0; ox < sw; ox++) {
637 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700638 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700639 }
640 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
641 const size_t nr_block_size = min(nc - nr_block_start, nr);
642 if XNN_LIKELY(b != NULL) {
643 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
644 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
645 }
646 }
647 packed_w += nr;
648 for (size_t ky = oy; ky < kh; ky += sh) {
649 for (size_t kx = ox; kx < kw; kx += sw) {
650 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
651 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
652 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
653 *packed_w++ =
654 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
655 }
656 }
657 packed_w += (nr - nr_block_size) * kr;
658 }
659
660 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
661 const size_t kr_block_size = min(kc - kr_block_start, kr);
662 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
663 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
664 *packed_w++ =
665 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
666 }
667 packed_w += kr - kr_block_size;
668 }
669 packed_w += (nr - nr_block_size) * kr;
670 }
671 }
672 }
673 }
674 }
675 }
676 k += kh * kw * kc * nc;
677 if XNN_UNPREDICTABLE(b != NULL) {
678 b += nc;
679 }
680 }
681}
682
683void xnn_pack_f16_deconv_goki_w(
684 size_t g,
685 size_t nc,
686 size_t kh,
687 size_t kw,
688 size_t kc,
689 size_t sh,
690 size_t sw,
691 size_t nr,
692 size_t kr,
693 size_t sr,
694 const uint16_t* k,
695 const uint16_t* b,
696 uint16_t* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700697 struct subconvolution_params* subconv_params,
698 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700699{
700 const size_t skr = sr * kr;
701 const size_t skc = round_down_po2(kc, skr);
702 const size_t sr_mask = (sr - 1) * kr;
703 for (size_t i = 0; i < g; i++) {
704 for (size_t oy = 0; oy < sh; oy++) {
705 for (size_t ox = 0; ox < sw; ox++) {
706 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700707 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700708 }
709 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
710 const size_t nr_block_size = min(nc - nr_block_start, nr);
711 if XNN_LIKELY(b != NULL) {
712 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
713 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
714 }
715 }
716 packed_w += nr;
717 for (size_t ky = oy; ky < kh; ky += sh) {
718 for (size_t kx = ox; kx < kw; kx += sw) {
719 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
720 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
721 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
722 *packed_w++ =
723 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
724 }
725 }
726 packed_w += (nr - nr_block_size) * kr;
727 }
728
729 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
730 const size_t kr_block_size = min(kc - kr_block_start, kr);
731 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
732 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
733 *packed_w++ =
734 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
735 }
736 packed_w += kr - kr_block_size;
737 }
738 packed_w += (nr - nr_block_size) * kr;
739 }
740 }
741 }
742 }
743 }
744 }
745 k += kh * kw * kc * nc;
746 if XNN_UNPREDICTABLE(b != NULL) {
747 b += nc;
748 }
749 }
750}
751
Marat Dukhanab582382020-07-06 13:32:08 -0700752void xnn_pack_q8_deconv_goki_w(
753 size_t g,
754 size_t nc,
755 size_t kh,
756 size_t kw,
757 size_t kc,
758 size_t sh,
759 size_t sw,
760 size_t nr,
761 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700762 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700763 const uint8_t* k,
764 const int32_t* b,
765 void* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700766 struct subconvolution_params* subconv_params,
767 const struct xnn_q8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700768{
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700769 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700770 const int32_t izp = (int32_t) params->input_zero_point;
771 const int32_t kzp = (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700772 for (size_t i = 0; i < g; i++) {
773 for (size_t oy = 0; oy < sh; oy++) {
774 for (size_t ox = 0; ox < sw; ox++) {
775 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700776 (*subconv_params++).weights = packed_w;
Marat Dukhanab582382020-07-06 13:32:08 -0700777 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700778 const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
Marat Dukhanab582382020-07-06 13:32:08 -0700779 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
780 const size_t nr_block_size = min(nc - nr_block_start, nr);
781 int32_t* packed_b = (int32_t*) packed_w;
782 if XNN_LIKELY(b != 0) {
783 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
784 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
785 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
786 }
787 } else {
788 size_t n = nr_block_size;
789 do {
790 *((int32_t*) packed_w) = boff;
791 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
792 } while (--n != 0);
793 }
794 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
795 for (size_t ky = oy; ky < kh; ky += sh) {
796 for (size_t kx = ox; kx < kw; kx += sw) {
797 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
798 const size_t kr_block_size = min(kc - kr_block_start, kr);
799 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
800 int32_t ksum = 0;
801 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
802 const uint8_t kv =
803 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
804 ksum += (int32_t) kv;
805 *((uint8_t*) packed_w) = kv;
806 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
807 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700808 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700809 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
810 }
811 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
812 }
813 }
814 }
815 }
816 }
817 }
818 k += kh * kw * kc * nc;
819 if XNN_UNPREDICTABLE(b != NULL) {
820 b += nc;
821 }
822 }
823}
824
Marat Dukhana6879bd2020-07-06 14:25:08 -0700825void xnn_pack_f32_dwconv_ghw_w(
826 size_t h,
827 size_t w,
828 size_t c,
829 size_t cr,
830 const float* k,
831 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700832 float* packed_w,
833 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700834{
835 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
836 const size_t cr_block_size = min(c - cr_block_start, cr);
837 if XNN_LIKELY(b != NULL) {
838 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
839 *packed_w++ = b[cr_block_start + cr_block_offset];
840 }
841 } else {
842 size_t n = cr_block_size;
843 do {
844 *packed_w++ = 0.0f;
845 } while (--n != 0);
846 }
847 packed_w += cr - cr_block_size;
848 for (size_t x = 0; x < w; x++) {
849 for (size_t y = 0; y < h; y++) {
850 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
851 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
852 *packed_w++ = kv;
853 }
854 packed_w += cr - cr_block_size;
855 }
856 }
857 }
858}
859
860void xnn_pack_f16_dwconv_ghw_w(
861 size_t h,
862 size_t w,
863 size_t c,
864 size_t cr,
865 const uint16_t* k,
866 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700867 uint16_t* packed_w,
868 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700869{
870 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
871 const size_t cr_block_size = min(c - cr_block_start, cr);
872 if XNN_LIKELY(b != NULL) {
873 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
874 *packed_w++ = b[cr_block_start + cr_block_offset];
875 }
876 } else {
877 size_t n = cr_block_size;
878 do {
879 *packed_w++ = 0;
880 } while (--n != 0);
881 }
882 packed_w += cr - cr_block_size;
883 for (size_t x = 0; x < w; x++) {
884 for (size_t y = 0; y < h; y++) {
885 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
886 const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
887 *packed_w++ = kv;
888 }
889 packed_w += cr - cr_block_size;
890 }
891 }
892 }
893}
894
Marat Dukhanab582382020-07-06 13:32:08 -0700895void xnn_pack_q8_dwconv_ghw_w(
896 size_t h,
897 size_t w,
898 size_t c,
899 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -0700900 const uint8_t* k,
901 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700902 void* packed_w,
903 const struct xnn_q8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700904{
Marat Dukhanb42f8662020-07-06 20:46:13 -0700905 const int32_t izp = (int32_t) params->input_zero_point;
906 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700907 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
908 const size_t cr_block_size = min(c - cr_block_start, cr);
909 int32_t* packed_b = (int32_t*) packed_w;
910 if XNN_LIKELY(b != NULL) {
911 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
912 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
913 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
914 }
915 } else {
916 size_t n = cr_block_size;
917 do {
918 *((int32_t*) packed_w) = boff;
919 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
920 } while (--n != 0);
921 }
922 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
923 for (size_t x = 0; x < w; x++) {
924 for (size_t y = 0; y < h; y++) {
925 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
926 const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
Marat Dukhanb42f8662020-07-06 20:46:13 -0700927 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700928 *((uint8_t*) packed_w) = kv;
929 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
930 }
931 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
932 }
933 }
934 }
935}
936
Marat Dukhana6879bd2020-07-06 14:25:08 -0700937void xnn_pack_f32_dwconv_hwg_w(
938 size_t h,
939 size_t w,
940 size_t c,
941 size_t cr,
942 const float* k,
943 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700944 float* packed_w,
945 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700946{
947 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
948 const size_t cr_block_size = min(c - cr_block_start, cr);
949 if XNN_LIKELY(b != NULL) {
950 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
951 *packed_w++ = b[cr_block_start + cr_block_offset];
952 }
953 } else {
954 size_t n = cr_block_size;
955 do {
956 *packed_w++ = 0.0f;
957 } while (--n != 0);
958 }
959 packed_w += cr - cr_block_size;
960 for (size_t x = 0; x < w; x++) {
961 for (size_t y = 0; y < h; y++) {
962 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
963 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
964 *packed_w++ = kv;
965 }
966 packed_w += cr - cr_block_size;
967 }
968 }
969 }
970}
971
972void xnn_pack_f16_dwconv_hwg_w(
973 size_t h,
974 size_t w,
975 size_t c,
976 size_t cr,
977 const uint16_t* k,
978 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700979 uint16_t* packed_w,
980 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700981{
982 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
983 const size_t cr_block_size = min(c - cr_block_start, cr);
984 if XNN_LIKELY(b != NULL) {
985 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
986 *packed_w++ = b[cr_block_start + cr_block_offset];
987 }
988 } else {
989 size_t n = cr_block_size;
990 do {
991 *packed_w++ = 0;
992 } while (--n != 0);
993 }
994 packed_w += cr - cr_block_size;
995 for (size_t x = 0; x < w; x++) {
996 for (size_t y = 0; y < h; y++) {
997 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
998 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
999 *packed_w++ = kv;
1000 }
1001 packed_w += cr - cr_block_size;
1002 }
1003 }
1004 }
1005}
1006
Marat Dukhanab582382020-07-06 13:32:08 -07001007void xnn_pack_q8_dwconv_hwg_w(
1008 size_t h,
1009 size_t w,
1010 size_t c,
1011 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001012 const uint8_t* k,
1013 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001014 void* packed_w,
1015 const struct xnn_q8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001016{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001017 const int32_t izp = (int32_t) params->input_zero_point;
1018 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001019 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1020 const size_t cr_block_size = min(c - cr_block_start, cr);
1021 int32_t* packed_b = (int32_t*) packed_w;
1022 if XNN_LIKELY(b != NULL) {
1023 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1024 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1025 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1026 }
1027 } else {
1028 size_t n = cr_block_size;
1029 do {
1030 *((int32_t*) packed_w) = boff;
1031 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1032 } while (--n != 0);
1033 }
1034 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1035 for (size_t x = 0; x < w; x++) {
1036 for (size_t y = 0; y < h; y++) {
1037 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1038 const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001039 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001040 *((uint8_t*) packed_w) = kv;
1041 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1042 }
1043 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1044 }
1045 }
1046 }
1047}
1048
// Packs f32 GEMM weights, indexed per group as k[n * kc + kk], into tiles of
// `nr` rows by `kr` columns. No bias is emitted.
//
// For each of the `g` groups (the loop body runs before `g` is tested, so the
// caller must pass g >= 1) and each block of up to `nr` rows:
//   * column offsets below round_down_po2(kc, sr * kr) are packed in full
//     `kr`-wide steps, where the source column for row j of the block is
//     round_down_po2(step, sr * kr) + ((step + j * kr) & ((sr - 1) * kr));
//   * the remaining columns are packed straight through, `kr` at a time, with
//     a possibly short final step.
// Output slots belonging to absent rows or absent columns are skipped, not
// cleared. `params` is not read.
// NOTE(review): round_down_po2 is defined elsewhere; presumably it rounds down
// to a multiple of a power-of-two second argument — confirm.
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t shuffle_span = sr * kr;
  const size_t shuffled_kc = round_down_po2(kc, shuffle_span);
  const size_t shuffle_mask = (sr - 1) * kr;

  for (;;) {
    for (size_t n_base = 0; n_base < nc; n_base += nr) {
      const size_t live_rows = min(nc - n_base, nr);

      // Full-width steps with the per-row column rotation.
      for (size_t k_base = 0; k_base < shuffled_kc; k_base += kr) {
        const size_t span_start = round_down_po2(k_base, shuffle_span);
        for (size_t j = 0; j < live_rows; j++) {
          const size_t rotated = (k_base + j * kr) & shuffle_mask;
          for (size_t t = 0; t < kr; t++) {
            packed_w[j * kr + t] = k[(n_base + j) * kc + span_start + rotated + t];
          }
        }
        packed_w += nr * kr;
      }

      // Tail columns, copied without rotation.
      for (size_t k_base = shuffled_kc; k_base < kc; k_base += kr) {
        const size_t live_cols = min(kc - k_base, kr);
        for (size_t j = 0; j < live_rows; j++) {
          for (size_t t = 0; t < live_cols; t++) {
            packed_w[j * kr + t] = k[(n_base + j) * kc + (k_base + t)];
          }
        }
        packed_w += nr * kr;
      }
    }

    // Advance to the next group's weights.
    k += nc * kc;
    g -= 1;
    if (g == 0) {
      break;
    }
  }
}
1092
// Packs 16-bit GEMM weights, indexed per group as k[n * kc + kk], into tiles
// of `nr` rows by `kr` columns. Elements are copied verbatim as uint16_t
// values and no bias is emitted.
//
// For each of the `g` groups (the loop body runs before `g` is tested, so the
// caller must pass g >= 1) and each block of up to `nr` rows:
//   * column offsets below round_down_po2(kc, sr * kr) are packed in full
//     `kr`-wide steps, where the source column for row j of the block is
//     round_down_po2(step, sr * kr) + ((step + j * kr) & ((sr - 1) * kr));
//   * the remaining columns are packed straight through, `kr` at a time, with
//     a possibly short final step.
// Output slots belonging to absent rows or absent columns are skipped, not
// cleared. `params` is not read.
// NOTE(review): round_down_po2 is defined elsewhere; presumably it rounds down
// to a multiple of a power-of-two second argument — confirm.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t shuffle_span = sr * kr;
  const size_t shuffled_kc = round_down_po2(kc, shuffle_span);
  const size_t shuffle_mask = (sr - 1) * kr;

  for (;;) {
    for (size_t n_base = 0; n_base < nc; n_base += nr) {
      const size_t live_rows = min(nc - n_base, nr);

      // Full-width steps with the per-row column rotation.
      for (size_t k_base = 0; k_base < shuffled_kc; k_base += kr) {
        const size_t span_start = round_down_po2(k_base, shuffle_span);
        for (size_t j = 0; j < live_rows; j++) {
          const size_t rotated = (k_base + j * kr) & shuffle_mask;
          for (size_t t = 0; t < kr; t++) {
            packed_w[j * kr + t] = k[(n_base + j) * kc + span_start + rotated + t];
          }
        }
        packed_w += nr * kr;
      }

      // Tail columns, copied without rotation.
      for (size_t k_base = shuffled_kc; k_base < kc; k_base += kr) {
        const size_t live_cols = min(kc - k_base, kr);
        for (size_t j = 0; j < live_rows; j++) {
          for (size_t t = 0; t < live_cols; t++) {
            packed_w[j * kr + t] = k[(n_base + j) * kc + (k_base + t)];
          }
        }
        packed_w += nr * kr;
      }
    }

    // Advance to the next group's weights.
    k += nc * kc;
    g -= 1;
    if (g == 0) {
      break;
    }
  }
}
1136
Marat Dukhana6879bd2020-07-06 14:25:08 -07001137void xnn_pack_f32_dconv_oki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001138 size_t nc,
Marat Dukhanab582382020-07-06 13:32:08 -07001139 size_t kc,
1140 size_t nr,
Marat Dukhana6879bd2020-07-06 14:25:08 -07001141 size_t kh,
1142 size_t kw,
1143 const float* k,
1144 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001145 float* packed_w,
1146 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001147{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001148 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1149 const size_t nr_block_size = min(nc - nr_block_start, nr);
1150 if XNN_LIKELY(b != NULL) {
1151 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1152 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
Marat Dukhanab582382020-07-06 13:32:08 -07001153 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001154 } else {
1155 size_t n = nr;
1156 do {
1157 *packed_w++ = 0.0f;
1158 } while (--n != 0);
1159 }
Marat Dukhanab582382020-07-06 13:32:08 -07001160
Marat Dukhana6879bd2020-07-06 14:25:08 -07001161 for (size_t kx = 0; kx < kw; kx++) {
1162 for (size_t c = 0; c < kc; c++) {
1163 for (size_t ky = 0; ky < kh; ky++) {
1164 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1165 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
Marat Dukhanab582382020-07-06 13:32:08 -07001166 }
Marat Dukhanab582382020-07-06 13:32:08 -07001167 }
1168 }
1169 }
Marat Dukhanab582382020-07-06 13:32:08 -07001170 if XNN_UNPREDICTABLE(b != NULL) {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001171 b += nr;
Marat Dukhanab582382020-07-06 13:32:08 -07001172 }
1173 }
1174}
1175
1176void xnn_pack_f16_dconv_oki_w(
1177 size_t nc,
1178 size_t kc,
1179 size_t nr,
1180 size_t kh,
1181 size_t kw,
1182 const uint16_t* k,
1183 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001184 uint16_t* packed_w,
1185 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001186{
1187 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1188 const size_t nr_block_size = min(nc - nr_block_start, nr);
1189 if XNN_LIKELY(b != NULL) {
1190 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1191 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
1192 }
1193 } else {
1194 size_t n = nr;
1195 do {
1196 *packed_w++ = 0;
1197 } while (--n != 0);
1198 }
1199
1200 for (size_t kx = 0; kx < kw; kx++) {
1201 for (size_t c = 0; c < kc; c++) {
1202 for (size_t ky = 0; ky < kh; ky++) {
1203 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1204 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
1205 }
1206 }
1207 }
1208 }
1209 if XNN_UNPREDICTABLE(b != NULL) {
1210 b += nr;
1211 }
1212 }
1213}
1214
Marat Dukhana6879bd2020-07-06 14:25:08 -07001215void xnn_pack_f32_chw_dwconv_ghw_w(
1216 size_t kernel_size,
1217 size_t groups,
1218 const float* kernel,
1219 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001220 float* packed_weights,
1221 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001222{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001223 for (size_t g = 0; g < groups; g++) {
1224 if XNN_LIKELY(bias != NULL) {
1225 *packed_weights = *bias++;
Marat Dukhanab582382020-07-06 13:32:08 -07001226 } else {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001227 *packed_weights = 0.0f;
Marat Dukhanab582382020-07-06 13:32:08 -07001228 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001229 packed_weights += 1;
1230 for (size_t i = 0; i < kernel_size; i++) {
1231 *packed_weights++ = kernel[g * kernel_size + i];
Marat Dukhanab582382020-07-06 13:32:08 -07001232 }
1233 }
1234}
1235
1236void xnn_pack_f16_chw_dwconv_ghw_w(
1237 size_t kernel_size,
1238 size_t groups,
1239 const uint16_t* kernel,
1240 const uint16_t* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001241 uint16_t* packed_weights,
1242 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001243{
1244 for (size_t g = 0; g < groups; g++) {
1245 if XNN_LIKELY(bias != NULL) {
1246 *packed_weights = *bias++;
1247 } else {
1248 *packed_weights = 0;
1249 }
1250 packed_weights += 1;
1251 for (size_t i = 0; i < kernel_size; i++) {
1252 *packed_weights++ = kernel[g * kernel_size + i];
1253 }
1254 }
1255}
1256
Marat Dukhanab582382020-07-06 13:32:08 -07001257void xnn_pack_f32_chw_dwconv_hwg_w(
1258 size_t kernel_size,
1259 size_t groups,
1260 const float* kernel,
1261 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001262 float* packed_weights,
1263 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001264{
1265 for (size_t g = 0; g < groups; g++) {
1266 if XNN_LIKELY(bias != NULL) {
1267 *packed_weights = *bias++;
1268 } else {
1269 *packed_weights = 0.0f;
1270 }
1271 packed_weights += 1;
1272 for (size_t i = 0; i < kernel_size; i++) {
1273 *packed_weights++ = kernel[i * groups + g];
1274 }
1275 }
1276}
1277
1278void xnn_pack_f32_vmulcaddc_w(
1279 size_t c,
1280 size_t cr,
1281 const float* s,
1282 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001283 float* packed_w,
1284 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001285{
1286 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1287 const size_t cr_block_size = min(c - cr_block_start, cr);
1288 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1289 *packed_w++ = s[cr_block_start + cr_block_offset];
1290 }
1291 packed_w += cr - cr_block_size;
1292 if XNN_LIKELY(b != NULL) {
1293 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1294 *packed_w++ = b[cr_block_start + cr_block_offset];
1295 }
1296 } else {
1297 size_t n = cr_block_size;
1298 do {
1299 *packed_w++ = 0.0f;
1300 } while (--n != 0);
1301 }
1302 packed_w += cr - cr_block_size;
1303 }
1304}
1305
1306void xnn_pack_f16_vmulcaddc_w(
1307 size_t c,
1308 size_t cr,
1309 const uint16_t* s,
1310 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001311 uint16_t* packed_w,
1312 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001313{
1314 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1315 const size_t cr_block_size = min(c - cr_block_start, cr);
1316 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1317 *packed_w++ = s[cr_block_start + cr_block_offset];
1318 }
1319 packed_w += cr - cr_block_size;
1320 if XNN_LIKELY(b != NULL) {
1321 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1322 *packed_w++ = b[cr_block_start + cr_block_offset];
1323 }
1324 } else {
1325 size_t n = cr_block_size;
1326 do {
1327 *packed_w++ = 0;
1328 } while (--n != 0);
1329 }
1330 packed_w += cr - cr_block_size;
1331 }
1332}