// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8
9#include <stdint.h>
10#include <stddef.h>
11
12#include <xnnpack/math.h>
13#include <xnnpack/pack.h>
14
15
Marat Dukhana6879bd2020-07-06 14:25:08 -070016void xnn_pack_f32_gemm_goi_w(
17 size_t g,
18 size_t nc,
19 size_t kc,
20 size_t nr,
21 size_t kr,
22 size_t sr,
23 const float* k,
24 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070025 float* packed_w,
26 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070027{
28 const size_t skr = sr * kr;
29 const size_t skc = round_down_po2(kc, skr);
30 const size_t sr_mask = (sr - 1) * kr;
31 do {
32 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
33 const size_t nr_block_size = min(nc - nr_block_start, nr);
34 if XNN_LIKELY(b != NULL) {
35 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
36 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
37 }
38 }
39 packed_w += nr;
40
41 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
42 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
43 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
44 *packed_w++ =
45 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
46 }
47 }
48 packed_w += (nr - nr_block_size) * kr;
49 }
50
51 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
52 const size_t kr_block_size = min(kc - kr_block_start, kr);
53 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
54 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
55 *packed_w++ =
56 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
57 }
58 packed_w += kr - kr_block_size;
59 }
60 packed_w += (nr - nr_block_size) * kr;
61 }
62 }
63 k += nc * kc;
64 if XNN_UNPREDICTABLE(b != NULL) {
65 b += nc;
66 }
67 } while (--g != 0);
68}
69
70void xnn_pack_f16_gemm_goi_w(
71 size_t g,
72 size_t nc,
73 size_t kc,
74 size_t nr,
75 size_t kr,
76 size_t sr,
77 const uint16_t* k,
78 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070079 uint16_t* packed_w,
80 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070081{
82 const size_t skr = sr * kr;
83 const size_t skc = round_down_po2(kc, skr);
84 const size_t sr_mask = (sr - 1) * kr;
85 do {
86 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
87 const size_t nr_block_size = min(nc - nr_block_start, nr);
88 if XNN_LIKELY(b != NULL) {
89 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
90 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
91 }
92 }
93 packed_w += nr;
94
95 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
96 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
97 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
98 *packed_w++ =
99 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
100 }
101 }
102 packed_w += (nr - nr_block_size) * kr;
103 }
104
105 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
106 const size_t kr_block_size = min(kc - kr_block_start, kr);
107 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
108 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
109 *packed_w++ =
110 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
111 }
112 packed_w += kr - kr_block_size;
113 }
114 packed_w += (nr - nr_block_size) * kr;
115 }
116 }
117 k += nc * kc;
118 if XNN_UNPREDICTABLE(b != NULL) {
119 b += nc;
120 }
121 } while (--g != 0);
122}
123
Marat Dukhanab582382020-07-06 13:32:08 -0700124void xnn_pack_q8_gemm_goi_w(
125 size_t g,
126 size_t nc,
127 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700128 size_t nr,
129 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700130 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700131 const uint8_t* k,
132 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700133 void* packed_w,
134 const struct xnn_q8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700135{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700136 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700137 const int32_t izp = (int32_t) params->input_zero_point;
138 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700139 do {
140 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
141 const size_t nr_block_size = min(nc - nr_block_start, nr);
142 int32_t* packed_b = (int32_t*) packed_w;
143 if XNN_LIKELY(b != NULL) {
144 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
145 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
146 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
147 }
148 } else {
149 size_t n = nr_block_size;
150 do {
151 *((int32_t*) packed_w) = boff;
152 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
153 } while (--n != 0);
154 }
155 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
156 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
157 const size_t kr_block_size = min(kc - kr_block_start, kr);
158 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
159 int32_t ksum = 0;
160 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
161 const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
162 ksum += (int32_t) kv;
163 *((uint8_t*) packed_w) = kv;
164 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
165 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700166 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700167 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
168 }
169 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
170 }
171 }
172 k += nc * kc;
173 if XNN_UNPREDICTABLE(b != NULL) {
174 b += nc;
175 }
176 } while (--g != 0);
177}
178
Marat Dukhana6879bd2020-07-06 14:25:08 -0700179void xnn_pack_f32_gemm_io_w(
180 size_t nc,
181 size_t kc,
182 size_t nr,
183 size_t kr,
184 size_t sr,
185 const float* k,
186 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700187 float* packed_w,
188 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700189{
190 const size_t skr = sr * kr;
191 const size_t skc = round_down_po2(kc, skr);
192 const size_t sr_mask = (sr - 1) * kr;
193 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
194 const size_t nr_block_size = min(nc - nr_block_start, nr);
195 if XNN_LIKELY(b != NULL) {
196 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
197 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
198 }
199 }
200 packed_w += nr;
201
202 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
203 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
204 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
205 *packed_w++ =
206 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
207 }
208 }
209 packed_w += (nr - nr_block_size) * kr;
210 }
211
212 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
213 const size_t kr_block_size = min(kc - kr_block_start, kr);
214 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
215 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
216 *packed_w++ =
217 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
218 }
219 packed_w += kr - kr_block_size;
220 }
221 packed_w += (nr - nr_block_size) * kr;
222 }
223 }
224}
225
226void xnn_pack_f16_gemm_io_w(
227 size_t nc,
228 size_t kc,
229 size_t nr,
230 size_t kr,
231 size_t sr,
232 const uint16_t* k,
233 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700234 uint16_t* packed_w,
235 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700236{
237 const size_t skr = sr * kr;
238 const size_t skc = round_down_po2(kc, skr);
239 const size_t sr_mask = (sr - 1) * kr;
240 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
241 const size_t nr_block_size = min(nc - nr_block_start, nr);
242 if XNN_LIKELY(b != NULL) {
243 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
244 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
245 }
246 }
247 packed_w += nr;
248
249 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
250 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
251 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
252 *packed_w++ =
253 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
254 }
255 }
256 packed_w += (nr - nr_block_size) * kr;
257 }
258
259 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
260 const size_t kr_block_size = min(kc - kr_block_start, kr);
261 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
262 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
263 *packed_w++ =
264 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
265 }
266 packed_w += kr - kr_block_size;
267 }
268 packed_w += (nr - nr_block_size) * kr;
269 }
270 }
271}
272
Marat Dukhanab582382020-07-06 13:32:08 -0700273void xnn_pack_q8_gemm_io_w(
274 size_t nc,
275 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700276 size_t nr,
277 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700278 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700279 const uint8_t* k,
280 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700281 void* packed_w,
282 const struct xnn_q8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700283{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700284 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700285 const int32_t izp = (int32_t) params->input_zero_point;
286 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700287 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
288 const size_t nr_block_size = min(nc - nr_block_start, nr);
289 int32_t* packed_b = (int32_t*) packed_w;
290 if XNN_LIKELY(b != NULL) {
291 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
292 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
293 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
294 }
295 } else {
296 size_t n = nr_block_size;
297 do {
298 *((int32_t*) packed_w) = boff;
299 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
300 } while (--n != 0);
301 }
302 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
303 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
304 const size_t kr_block_size = min(kc - kr_block_start, kr);
305 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
306 int32_t ksum = 0;
307 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
308 const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
309 ksum += (int32_t) kv;
310 *((uint8_t*) packed_w) = kv;
311 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
312 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700313 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700314 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
315 }
316 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
317 }
318 }
319}
320
Marat Dukhana6879bd2020-07-06 14:25:08 -0700321void xnn_pack_f32_conv_goki_w(
322 size_t g,
323 size_t nc,
324 size_t ks,
325 size_t kc,
326 size_t nr,
327 size_t kr,
328 size_t sr,
329 const float* k,
330 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700331 float* packed_w,
332 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700333{
334 const size_t skr = sr * kr;
335 const size_t skc = round_down_po2(kc, skr);
336 const size_t sr_mask = (sr - 1) * kr;
337 do {
338 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
339 const size_t nr_block_size = min(nc - nr_block_start, nr);
340 if XNN_LIKELY(b != NULL) {
341 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
342 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
343 }
344 }
345 packed_w += nr;
346
347 for (size_t ki = 0; ki < ks; ki++) {
348 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
349 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
350 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
351 *packed_w++ =
352 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
353 }
354 }
355 packed_w += (nr - nr_block_size) * kr;
356 }
357
358 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
359 const size_t kr_block_size = min(kc - kr_block_start, kr);
360 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
361 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
362 *packed_w++ =
363 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
364 }
365 packed_w += kr - kr_block_size;
366 }
367 packed_w += (nr - nr_block_size) * kr;
368 }
369 }
370 }
371 k += ks * kc * nc;
372 if XNN_UNPREDICTABLE(b != NULL) {
373 b += nc;
374 }
375 } while (--g != 0);
376}
377
378void xnn_pack_f16_conv_goki_w(
379 size_t g,
380 size_t nc,
381 size_t ks,
382 size_t kc,
383 size_t nr,
384 size_t kr,
385 size_t sr,
386 const uint16_t* k,
387 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700388 uint16_t* packed_w,
389 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700390{
391 const size_t skr = sr * kr;
392 const size_t skc = round_down_po2(kc, skr);
393 const size_t sr_mask = (sr - 1) * kr;
394 do {
395 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
396 const size_t nr_block_size = min(nc - nr_block_start, nr);
397 if XNN_LIKELY(b != NULL) {
398 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
399 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
400 }
401 }
402 packed_w += nr;
403
404 for (size_t ki = 0; ki < ks; ki++) {
405 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
406 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
407 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
408 *packed_w++ =
409 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
410 }
411 }
412 packed_w += (nr - nr_block_size) * kr;
413 }
414
415 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
416 const size_t kr_block_size = min(kc - kr_block_start, kr);
417 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
418 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
419 *packed_w++ =
420 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
421 }
422 packed_w += kr - kr_block_size;
423 }
424 packed_w += (nr - nr_block_size) * kr;
425 }
426 }
427 }
428 k += ks * kc * nc;
429 if XNN_UNPREDICTABLE(b != NULL) {
430 b += nc;
431 }
432 } while (--g != 0);
433}
434
Marat Dukhanab582382020-07-06 13:32:08 -0700435void xnn_pack_q8_conv_goki_w(
436 size_t g,
437 size_t nc,
438 size_t ks,
439 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700440 size_t nr,
441 size_t kr,
Marat Dukhanab582382020-07-06 13:32:08 -0700442 const uint8_t* k,
443 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700444 void* packed_w,
445 const struct xnn_q8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700446{
Marat Dukhanb42f8662020-07-06 20:46:13 -0700447 const int32_t izp = (int32_t) params->input_zero_point;
448 const int32_t boff = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700449 do {
450 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
451 const size_t nr_block_size = min(nc - nr_block_start, nr);
452 int32_t* packed_b = (int32_t*) packed_w;
453 if XNN_LIKELY(b != NULL) {
454 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
455 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
456 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
457 }
458 } else {
459 size_t n = nr_block_size;
460 do {
461 *((int32_t*) packed_w) = boff;
462 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
463 } while (--n != 0);
464 }
465 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
466 for (size_t ki = 0; ki < ks; ki++) {
467 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
468 const size_t kr_block_size = min(kc - kr_block_start, kr);
469 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
470 int32_t ksum = 0;
471 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
472 const uint8_t kv =
473 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
474 ksum += (int32_t) kv;
475 *((uint8_t*) packed_w) = kv;
476 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
477 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700478 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700479 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
480 }
481 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
482 }
483 }
484 }
485 k += ks * kc * nc;
486 if XNN_UNPREDICTABLE(b != NULL) {
487 b += nc;
488 }
489 } while (--g != 0);
490}
491
Marat Dukhana6879bd2020-07-06 14:25:08 -0700492void xnn_pack_f32_conv_kgo_w(
493 size_t g,
494 size_t nc,
495 size_t ks,
496 size_t nr,
497 size_t kr,
498 const float* k,
499 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700500 float* packed_w,
501 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700502{
503 for (size_t i = 0; i < g; i++) {
504 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
505 const size_t nr_block_size = min(nc - nr_block_start, nr);
506 if XNN_LIKELY(b != NULL) {
507 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
508 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
509 }
510 }
511 packed_w += nr;
512 for (size_t ki = 0; ki < ks; ki++) {
513 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
514 *packed_w =
515 k[ki * g * nc + (nr_block_start + nr_block_offset)];
516 packed_w += kr;
517 }
518 packed_w += (nr - nr_block_size) * kr;
519 }
520 }
521 k += nc;
522 if XNN_UNPREDICTABLE(b != NULL) {
523 b += nc;
524 }
525 }
526}
527
528void xnn_pack_f16_conv_kgo_w(
529 size_t g,
530 size_t nc,
531 size_t ks,
532 size_t nr,
533 size_t kr,
534 const uint16_t* k,
535 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700536 uint16_t* packed_w,
537 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700538{
539 for (size_t i = 0; i < g; i++) {
540 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
541 const size_t nr_block_size = min(nc - nr_block_start, nr);
542 if XNN_LIKELY(b != NULL) {
543 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
544 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
545 }
546 }
547 packed_w += nr;
548 for (size_t ki = 0; ki < ks; ki++) {
549 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
550 *packed_w =
551 k[ki * g * nc + (nr_block_start + nr_block_offset)];
552 packed_w += kr;
553 }
554 packed_w += (nr - nr_block_size) * kr;
555 }
556 }
557 k += nc;
558 if XNN_UNPREDICTABLE(b != NULL) {
559 b += nc;
560 }
561 }
562}
563
Marat Dukhanab582382020-07-06 13:32:08 -0700564void xnn_pack_q8_conv_kgo_w(
565 size_t g,
566 size_t nc,
567 size_t ks,
Marat Dukhana6879bd2020-07-06 14:25:08 -0700568 size_t nr,
569 size_t kr,
Marat Dukhanab582382020-07-06 13:32:08 -0700570 const uint8_t* k,
571 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700572 void* packed_w,
573 const struct xnn_q8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700574{
Marat Dukhanb42f8662020-07-06 20:46:13 -0700575 const int32_t izp = (int32_t) params->input_zero_point;
576 const int32_t boff = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700577 for (size_t i = 0; i < g; i++) {
578 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
579 const size_t nr_block_size = min(nc - nr_block_start, nr);
580 int32_t* packed_b = (int32_t*) packed_w;
581 if XNN_LIKELY(b != NULL) {
582 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
583 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
584 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
585 }
586 } else {
587 size_t n = nr_block_size;
588 do {
589 *((int32_t*) packed_w) = boff;
590 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
591 } while (--n != 0);
592 }
593 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
594 for (size_t ki = 0; ki < ks; ki++) {
595 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
596 const uint8_t kv =
597 k[ki * g * nc + (nr_block_start + nr_block_offset)];
598 *((uint8_t*) packed_w) = kv;
Marat Dukhanb42f8662020-07-06 20:46:13 -0700599 packed_b[nr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700600 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
601 }
602 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
603 }
604 }
605 k += nc;
606 if XNN_UNPREDICTABLE(b != NULL) {
607 b += nc;
608 }
609 }
610}
611
Marat Dukhana6879bd2020-07-06 14:25:08 -0700612void xnn_pack_f32_deconv_goki_w(
613 size_t g,
614 size_t nc,
615 size_t kh,
616 size_t kw,
617 size_t kc,
618 size_t sh,
619 size_t sw,
620 size_t nr,
621 size_t kr,
622 size_t sr,
623 const float* k,
624 const float* b,
625 float* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700626 struct subconvolution_params* subconv_params,
627 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700628{
629 const size_t skr = sr * kr;
630 const size_t skc = round_down_po2(kc, skr);
631 const size_t sr_mask = (sr - 1) * kr;
632 for (size_t i = 0; i < g; i++) {
633 for (size_t oy = 0; oy < sh; oy++) {
634 for (size_t ox = 0; ox < sw; ox++) {
635 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700636 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700637 }
638 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
639 const size_t nr_block_size = min(nc - nr_block_start, nr);
640 if XNN_LIKELY(b != NULL) {
641 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
642 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
643 }
644 }
645 packed_w += nr;
646 for (size_t ky = oy; ky < kh; ky += sh) {
647 for (size_t kx = ox; kx < kw; kx += sw) {
648 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
649 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
650 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
651 *packed_w++ =
652 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
653 }
654 }
655 packed_w += (nr - nr_block_size) * kr;
656 }
657
658 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
659 const size_t kr_block_size = min(kc - kr_block_start, kr);
660 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
661 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
662 *packed_w++ =
663 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
664 }
665 packed_w += kr - kr_block_size;
666 }
667 packed_w += (nr - nr_block_size) * kr;
668 }
669 }
670 }
671 }
672 }
673 }
674 k += kh * kw * kc * nc;
675 if XNN_UNPREDICTABLE(b != NULL) {
676 b += nc;
677 }
678 }
679}
680
681void xnn_pack_f16_deconv_goki_w(
682 size_t g,
683 size_t nc,
684 size_t kh,
685 size_t kw,
686 size_t kc,
687 size_t sh,
688 size_t sw,
689 size_t nr,
690 size_t kr,
691 size_t sr,
692 const uint16_t* k,
693 const uint16_t* b,
694 uint16_t* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700695 struct subconvolution_params* subconv_params,
696 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700697{
698 const size_t skr = sr * kr;
699 const size_t skc = round_down_po2(kc, skr);
700 const size_t sr_mask = (sr - 1) * kr;
701 for (size_t i = 0; i < g; i++) {
702 for (size_t oy = 0; oy < sh; oy++) {
703 for (size_t ox = 0; ox < sw; ox++) {
704 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700705 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700706 }
707 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
708 const size_t nr_block_size = min(nc - nr_block_start, nr);
709 if XNN_LIKELY(b != NULL) {
710 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
711 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
712 }
713 }
714 packed_w += nr;
715 for (size_t ky = oy; ky < kh; ky += sh) {
716 for (size_t kx = ox; kx < kw; kx += sw) {
717 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
718 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
719 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
720 *packed_w++ =
721 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
722 }
723 }
724 packed_w += (nr - nr_block_size) * kr;
725 }
726
727 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
728 const size_t kr_block_size = min(kc - kr_block_start, kr);
729 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
730 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
731 *packed_w++ =
732 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
733 }
734 packed_w += kr - kr_block_size;
735 }
736 packed_w += (nr - nr_block_size) * kr;
737 }
738 }
739 }
740 }
741 }
742 }
743 k += kh * kw * kc * nc;
744 if XNN_UNPREDICTABLE(b != NULL) {
745 b += nc;
746 }
747 }
748}
749
Marat Dukhanab582382020-07-06 13:32:08 -0700750void xnn_pack_q8_deconv_goki_w(
751 size_t g,
752 size_t nc,
753 size_t kh,
754 size_t kw,
755 size_t kc,
756 size_t sh,
757 size_t sw,
758 size_t nr,
759 size_t kr,
Marat Dukhanab582382020-07-06 13:32:08 -0700760 const uint8_t* k,
761 const int32_t* b,
762 void* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700763 struct subconvolution_params* subconv_params,
764 const struct xnn_q8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700765{
Marat Dukhanb42f8662020-07-06 20:46:13 -0700766 const int32_t izp = (int32_t) params->input_zero_point;
767 const int32_t kzp = (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700768 for (size_t i = 0; i < g; i++) {
769 for (size_t oy = 0; oy < sh; oy++) {
770 for (size_t ox = 0; ox < sw; ox++) {
771 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700772 (*subconv_params++).weights = packed_w;
Marat Dukhanab582382020-07-06 13:32:08 -0700773 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700774 const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
Marat Dukhanab582382020-07-06 13:32:08 -0700775 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
776 const size_t nr_block_size = min(nc - nr_block_start, nr);
777 int32_t* packed_b = (int32_t*) packed_w;
778 if XNN_LIKELY(b != 0) {
779 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
780 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
781 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
782 }
783 } else {
784 size_t n = nr_block_size;
785 do {
786 *((int32_t*) packed_w) = boff;
787 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
788 } while (--n != 0);
789 }
790 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
791 for (size_t ky = oy; ky < kh; ky += sh) {
792 for (size_t kx = ox; kx < kw; kx += sw) {
793 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
794 const size_t kr_block_size = min(kc - kr_block_start, kr);
795 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
796 int32_t ksum = 0;
797 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
798 const uint8_t kv =
799 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
800 ksum += (int32_t) kv;
801 *((uint8_t*) packed_w) = kv;
802 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
803 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700804 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700805 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
806 }
807 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
808 }
809 }
810 }
811 }
812 }
813 }
814 k += kh * kw * kc * nc;
815 if XNN_UNPREDICTABLE(b != NULL) {
816 b += nc;
817 }
818 }
819}
820
Marat Dukhana6879bd2020-07-06 14:25:08 -0700821void xnn_pack_f32_dwconv_ghw_w(
822 size_t h,
823 size_t w,
824 size_t c,
825 size_t cr,
826 const float* k,
827 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700828 float* packed_w,
829 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700830{
831 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
832 const size_t cr_block_size = min(c - cr_block_start, cr);
833 if XNN_LIKELY(b != NULL) {
834 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
835 *packed_w++ = b[cr_block_start + cr_block_offset];
836 }
837 } else {
838 size_t n = cr_block_size;
839 do {
840 *packed_w++ = 0.0f;
841 } while (--n != 0);
842 }
843 packed_w += cr - cr_block_size;
844 for (size_t x = 0; x < w; x++) {
845 for (size_t y = 0; y < h; y++) {
846 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
847 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
848 *packed_w++ = kv;
849 }
850 packed_w += cr - cr_block_size;
851 }
852 }
853 }
854}
855
856void xnn_pack_f16_dwconv_ghw_w(
857 size_t h,
858 size_t w,
859 size_t c,
860 size_t cr,
861 const uint16_t* k,
862 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700863 uint16_t* packed_w,
864 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700865{
866 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
867 const size_t cr_block_size = min(c - cr_block_start, cr);
868 if XNN_LIKELY(b != NULL) {
869 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
870 *packed_w++ = b[cr_block_start + cr_block_offset];
871 }
872 } else {
873 size_t n = cr_block_size;
874 do {
875 *packed_w++ = 0;
876 } while (--n != 0);
877 }
878 packed_w += cr - cr_block_size;
879 for (size_t x = 0; x < w; x++) {
880 for (size_t y = 0; y < h; y++) {
881 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
882 const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
883 *packed_w++ = kv;
884 }
885 packed_w += cr - cr_block_size;
886 }
887 }
888 }
889}
890
Marat Dukhanab582382020-07-06 13:32:08 -0700891void xnn_pack_q8_dwconv_ghw_w(
892 size_t h,
893 size_t w,
894 size_t c,
895 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -0700896 const uint8_t* k,
897 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700898 void* packed_w,
899 const struct xnn_q8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700900{
Marat Dukhanb42f8662020-07-06 20:46:13 -0700901 const int32_t izp = (int32_t) params->input_zero_point;
902 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700903 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
904 const size_t cr_block_size = min(c - cr_block_start, cr);
905 int32_t* packed_b = (int32_t*) packed_w;
906 if XNN_LIKELY(b != NULL) {
907 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
908 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
909 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
910 }
911 } else {
912 size_t n = cr_block_size;
913 do {
914 *((int32_t*) packed_w) = boff;
915 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
916 } while (--n != 0);
917 }
918 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
919 for (size_t x = 0; x < w; x++) {
920 for (size_t y = 0; y < h; y++) {
921 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
922 const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
Marat Dukhanb42f8662020-07-06 20:46:13 -0700923 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700924 *((uint8_t*) packed_w) = kv;
925 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
926 }
927 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
928 }
929 }
930 }
931}
932
Marat Dukhana6879bd2020-07-06 14:25:08 -0700933void xnn_pack_f32_dwconv_hwg_w(
934 size_t h,
935 size_t w,
936 size_t c,
937 size_t cr,
938 const float* k,
939 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700940 float* packed_w,
941 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700942{
943 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
944 const size_t cr_block_size = min(c - cr_block_start, cr);
945 if XNN_LIKELY(b != NULL) {
946 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
947 *packed_w++ = b[cr_block_start + cr_block_offset];
948 }
949 } else {
950 size_t n = cr_block_size;
951 do {
952 *packed_w++ = 0.0f;
953 } while (--n != 0);
954 }
955 packed_w += cr - cr_block_size;
956 for (size_t x = 0; x < w; x++) {
957 for (size_t y = 0; y < h; y++) {
958 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
959 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
960 *packed_w++ = kv;
961 }
962 packed_w += cr - cr_block_size;
963 }
964 }
965 }
966}
967
968void xnn_pack_f16_dwconv_hwg_w(
969 size_t h,
970 size_t w,
971 size_t c,
972 size_t cr,
973 const uint16_t* k,
974 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700975 uint16_t* packed_w,
976 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700977{
978 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
979 const size_t cr_block_size = min(c - cr_block_start, cr);
980 if XNN_LIKELY(b != NULL) {
981 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
982 *packed_w++ = b[cr_block_start + cr_block_offset];
983 }
984 } else {
985 size_t n = cr_block_size;
986 do {
987 *packed_w++ = 0;
988 } while (--n != 0);
989 }
990 packed_w += cr - cr_block_size;
991 for (size_t x = 0; x < w; x++) {
992 for (size_t y = 0; y < h; y++) {
993 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
994 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
995 *packed_w++ = kv;
996 }
997 packed_w += cr - cr_block_size;
998 }
999 }
1000 }
1001}
1002
Marat Dukhanab582382020-07-06 13:32:08 -07001003void xnn_pack_q8_dwconv_hwg_w(
1004 size_t h,
1005 size_t w,
1006 size_t c,
1007 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001008 const uint8_t* k,
1009 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001010 void* packed_w,
1011 const struct xnn_q8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001012{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001013 const int32_t izp = (int32_t) params->input_zero_point;
1014 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001015 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1016 const size_t cr_block_size = min(c - cr_block_start, cr);
1017 int32_t* packed_b = (int32_t*) packed_w;
1018 if XNN_LIKELY(b != NULL) {
1019 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1020 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1021 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1022 }
1023 } else {
1024 size_t n = cr_block_size;
1025 do {
1026 *((int32_t*) packed_w) = boff;
1027 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1028 } while (--n != 0);
1029 }
1030 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1031 for (size_t x = 0; x < w; x++) {
1032 for (size_t y = 0; y < h; y++) {
1033 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1034 const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001035 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001036 *((uint8_t*) packed_w) = kv;
1037 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1038 }
1039 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1040 }
1041 }
1042 }
1043}
1044
// Packs f32 GEMMINC weights (no bias) read as k[row * kc + column], one
// `nc * kc` matrix per group, into blocks of `nr` rows by `kr` columns.
//
// For every group and every block of up to `nr` rows, columns are consumed in
// steps of `kr`, and each step emits `nr * kr` output slots (row-major within
// the step):
//  - For column steps below `round_down_po2(kc, sr * kr)`, row `n` of the
//    block reads `kr` values starting at
//      round_down_po2(step, sr * kr) + ((step + n * kr) & ((sr - 1) * kr)).
//  - For the remaining steps, each row reads its columns in order, up to `kc`.
// Slots that belong to rows past `nc` or columns past `kc` are skipped and
// left untouched.
//
// `g` must be at least 1: the group loop executes its body before testing the
// count. `params` is not used.
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t shuffle_span = sr * kr;
  const size_t shuffled_kc = round_down_po2(kc, shuffle_span);
  const size_t shuffle_mask = (sr - 1) * kr;
  do {
    for (size_t row_base = 0; row_base < nc; row_base += nr) {
      const size_t rows = min(nc - row_base, nr);

      // Column steps that use the per-row shuffled offset.
      for (size_t col_base = 0; col_base < shuffled_kc; col_base += kr) {
        const size_t span_base = round_down_po2(col_base, shuffle_span);
        for (size_t n = 0; n < rows; n++) {
          const float* row = &k[(row_base + n) * kc];
          const size_t first = span_base + ((col_base + n * kr) & shuffle_mask);
          for (size_t i = 0; i < kr; i++) {
            packed_w[n * kr + i] = row[first + i];
          }
        }
        packed_w += nr * kr;
      }

      // Remaining column steps are copied in order; the last may be partial.
      for (size_t col_base = shuffled_kc; col_base < kc; col_base += kr) {
        const size_t cols = min(kc - col_base, kr);
        for (size_t n = 0; n < rows; n++) {
          const float* src = &k[(row_base + n) * kc + col_base];
          for (size_t i = 0; i < cols; i++) {
            packed_w[n * kr + i] = src[i];
          }
        }
        packed_w += nr * kr;
      }
    }
    // Advance to the next group's matrix.
    k += nc * kc;
  } while (--g != 0);
}
1088
// Packs 16-bit GEMMINC weights (no bias) read as k[row * kc + column], one
// `nc * kc` matrix per group, into blocks of `nr` rows by `kr` columns.
// Values are copied as raw uint16_t.
//
// For every group and every block of up to `nr` rows, columns are consumed in
// steps of `kr`, and each step emits `nr * kr` output slots (row-major within
// the step):
//  - For column steps below `round_down_po2(kc, sr * kr)`, row `n` of the
//    block reads `kr` values starting at
//      round_down_po2(step, sr * kr) + ((step + n * kr) & ((sr - 1) * kr)).
//  - For the remaining steps, each row reads its columns in order, up to `kc`.
// Slots that belong to rows past `nc` or columns past `kc` are skipped and
// left untouched.
//
// `g` must be at least 1: the group loop executes its body before testing the
// count. `params` is not used.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t shuffle_span = sr * kr;
  const size_t shuffled_kc = round_down_po2(kc, shuffle_span);
  const size_t shuffle_mask = (sr - 1) * kr;
  do {
    for (size_t row_base = 0; row_base < nc; row_base += nr) {
      const size_t rows = min(nc - row_base, nr);

      // Column steps that use the per-row shuffled offset.
      for (size_t col_base = 0; col_base < shuffled_kc; col_base += kr) {
        const size_t span_base = round_down_po2(col_base, shuffle_span);
        for (size_t n = 0; n < rows; n++) {
          const uint16_t* row = &k[(row_base + n) * kc];
          const size_t first = span_base + ((col_base + n * kr) & shuffle_mask);
          for (size_t i = 0; i < kr; i++) {
            packed_w[n * kr + i] = row[first + i];
          }
        }
        packed_w += nr * kr;
      }

      // Remaining column steps are copied in order; the last may be partial.
      for (size_t col_base = shuffled_kc; col_base < kc; col_base += kr) {
        const size_t cols = min(kc - col_base, kr);
        for (size_t n = 0; n < rows; n++) {
          const uint16_t* src = &k[(row_base + n) * kc + col_base];
          for (size_t i = 0; i < cols; i++) {
            packed_w[n * kr + i] = src[i];
          }
        }
        packed_w += nr * kr;
      }
    }
    // Advance to the next group's matrix.
    k += nc * kc;
  } while (--g != 0);
}
1132
Marat Dukhana6879bd2020-07-06 14:25:08 -07001133void xnn_pack_f32_dconv_oki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001134 size_t nc,
Marat Dukhanab582382020-07-06 13:32:08 -07001135 size_t kc,
1136 size_t nr,
Marat Dukhana6879bd2020-07-06 14:25:08 -07001137 size_t kh,
1138 size_t kw,
1139 const float* k,
1140 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001141 float* packed_w,
1142 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001143{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001144 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1145 const size_t nr_block_size = min(nc - nr_block_start, nr);
1146 if XNN_LIKELY(b != NULL) {
1147 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1148 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
Marat Dukhanab582382020-07-06 13:32:08 -07001149 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001150 } else {
1151 size_t n = nr;
1152 do {
1153 *packed_w++ = 0.0f;
1154 } while (--n != 0);
1155 }
Marat Dukhanab582382020-07-06 13:32:08 -07001156
Marat Dukhana6879bd2020-07-06 14:25:08 -07001157 for (size_t kx = 0; kx < kw; kx++) {
1158 for (size_t c = 0; c < kc; c++) {
1159 for (size_t ky = 0; ky < kh; ky++) {
1160 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1161 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
Marat Dukhanab582382020-07-06 13:32:08 -07001162 }
Marat Dukhanab582382020-07-06 13:32:08 -07001163 }
1164 }
1165 }
Marat Dukhanab582382020-07-06 13:32:08 -07001166 if XNN_UNPREDICTABLE(b != NULL) {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001167 b += nr;
Marat Dukhanab582382020-07-06 13:32:08 -07001168 }
1169 }
1170}
1171
1172void xnn_pack_f16_dconv_oki_w(
1173 size_t nc,
1174 size_t kc,
1175 size_t nr,
1176 size_t kh,
1177 size_t kw,
1178 const uint16_t* k,
1179 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001180 uint16_t* packed_w,
1181 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001182{
1183 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1184 const size_t nr_block_size = min(nc - nr_block_start, nr);
1185 if XNN_LIKELY(b != NULL) {
1186 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1187 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
1188 }
1189 } else {
1190 size_t n = nr;
1191 do {
1192 *packed_w++ = 0;
1193 } while (--n != 0);
1194 }
1195
1196 for (size_t kx = 0; kx < kw; kx++) {
1197 for (size_t c = 0; c < kc; c++) {
1198 for (size_t ky = 0; ky < kh; ky++) {
1199 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1200 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
1201 }
1202 }
1203 }
1204 }
1205 if XNN_UNPREDICTABLE(b != NULL) {
1206 b += nr;
1207 }
1208 }
1209}
1210
Marat Dukhana6879bd2020-07-06 14:25:08 -07001211void xnn_pack_f32_chw_dwconv_ghw_w(
1212 size_t kernel_size,
1213 size_t groups,
1214 const float* kernel,
1215 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001216 float* packed_weights,
1217 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001218{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001219 for (size_t g = 0; g < groups; g++) {
1220 if XNN_LIKELY(bias != NULL) {
1221 *packed_weights = *bias++;
Marat Dukhanab582382020-07-06 13:32:08 -07001222 } else {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001223 *packed_weights = 0.0f;
Marat Dukhanab582382020-07-06 13:32:08 -07001224 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001225 packed_weights += 1;
1226 for (size_t i = 0; i < kernel_size; i++) {
1227 *packed_weights++ = kernel[g * kernel_size + i];
Marat Dukhanab582382020-07-06 13:32:08 -07001228 }
1229 }
1230}
1231
1232void xnn_pack_f16_chw_dwconv_ghw_w(
1233 size_t kernel_size,
1234 size_t groups,
1235 const uint16_t* kernel,
1236 const uint16_t* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001237 uint16_t* packed_weights,
1238 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001239{
1240 for (size_t g = 0; g < groups; g++) {
1241 if XNN_LIKELY(bias != NULL) {
1242 *packed_weights = *bias++;
1243 } else {
1244 *packed_weights = 0;
1245 }
1246 packed_weights += 1;
1247 for (size_t i = 0; i < kernel_size; i++) {
1248 *packed_weights++ = kernel[g * kernel_size + i];
1249 }
1250 }
1251}
1252
Marat Dukhanab582382020-07-06 13:32:08 -07001253void xnn_pack_f32_chw_dwconv_hwg_w(
1254 size_t kernel_size,
1255 size_t groups,
1256 const float* kernel,
1257 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001258 float* packed_weights,
1259 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001260{
1261 for (size_t g = 0; g < groups; g++) {
1262 if XNN_LIKELY(bias != NULL) {
1263 *packed_weights = *bias++;
1264 } else {
1265 *packed_weights = 0.0f;
1266 }
1267 packed_weights += 1;
1268 for (size_t i = 0; i < kernel_size; i++) {
1269 *packed_weights++ = kernel[i * groups + g];
1270 }
1271 }
1272}
1273
1274void xnn_pack_f32_vmulcaddc_w(
1275 size_t c,
1276 size_t cr,
1277 const float* s,
1278 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001279 float* packed_w,
1280 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001281{
1282 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1283 const size_t cr_block_size = min(c - cr_block_start, cr);
1284 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1285 *packed_w++ = s[cr_block_start + cr_block_offset];
1286 }
1287 packed_w += cr - cr_block_size;
1288 if XNN_LIKELY(b != NULL) {
1289 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1290 *packed_w++ = b[cr_block_start + cr_block_offset];
1291 }
1292 } else {
1293 size_t n = cr_block_size;
1294 do {
1295 *packed_w++ = 0.0f;
1296 } while (--n != 0);
1297 }
1298 packed_w += cr - cr_block_size;
1299 }
1300}
1301
1302void xnn_pack_f16_vmulcaddc_w(
1303 size_t c,
1304 size_t cr,
1305 const uint16_t* s,
1306 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001307 uint16_t* packed_w,
1308 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001309{
1310 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1311 const size_t cr_block_size = min(c - cr_block_start, cr);
1312 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1313 *packed_w++ = s[cr_block_start + cr_block_offset];
1314 }
1315 packed_w += cr - cr_block_size;
1316 if XNN_LIKELY(b != NULL) {
1317 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1318 *packed_w++ = b[cr_block_start + cr_block_offset];
1319 }
1320 } else {
1321 size_t n = cr_block_size;
1322 do {
1323 *packed_w++ = 0;
1324 } while (--n != 0);
1325 }
1326 packed_w += cr - cr_block_size;
1327 }
1328}