blob: 5f52e2ecd6833ff08191c0c1ff0cf3d9d4f3d816 [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <xnnpack/math.h>
#include <xnnpack/pack.h>
14
15
Marat Dukhana6879bd2020-07-06 14:25:08 -070016void xnn_pack_f32_gemm_goi_w(
17 size_t g,
18 size_t nc,
19 size_t kc,
20 size_t nr,
21 size_t kr,
22 size_t sr,
23 const float* k,
24 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070025 float* packed_w,
26 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070027{
28 const size_t skr = sr * kr;
29 const size_t skc = round_down_po2(kc, skr);
30 const size_t sr_mask = (sr - 1) * kr;
31 do {
32 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
33 const size_t nr_block_size = min(nc - nr_block_start, nr);
34 if XNN_LIKELY(b != NULL) {
35 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
36 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
37 }
38 }
39 packed_w += nr;
40
41 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
42 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
43 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
44 *packed_w++ =
45 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
46 }
47 }
48 packed_w += (nr - nr_block_size) * kr;
49 }
50
51 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
52 const size_t kr_block_size = min(kc - kr_block_start, kr);
53 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
54 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
55 *packed_w++ =
56 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
57 }
58 packed_w += kr - kr_block_size;
59 }
60 packed_w += (nr - nr_block_size) * kr;
61 }
62 }
63 k += nc * kc;
64 if XNN_UNPREDICTABLE(b != NULL) {
65 b += nc;
66 }
67 } while (--g != 0);
68}
69
70void xnn_pack_f16_gemm_goi_w(
71 size_t g,
72 size_t nc,
73 size_t kc,
74 size_t nr,
75 size_t kr,
76 size_t sr,
77 const uint16_t* k,
78 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070079 uint16_t* packed_w,
80 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070081{
82 const size_t skr = sr * kr;
83 const size_t skc = round_down_po2(kc, skr);
84 const size_t sr_mask = (sr - 1) * kr;
85 do {
86 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
87 const size_t nr_block_size = min(nc - nr_block_start, nr);
88 if XNN_LIKELY(b != NULL) {
89 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
90 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
91 }
92 }
93 packed_w += nr;
94
95 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
96 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
97 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
98 *packed_w++ =
99 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
100 }
101 }
102 packed_w += (nr - nr_block_size) * kr;
103 }
104
105 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
106 const size_t kr_block_size = min(kc - kr_block_start, kr);
107 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
108 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
109 *packed_w++ =
110 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
111 }
112 packed_w += kr - kr_block_size;
113 }
114 packed_w += (nr - nr_block_size) * kr;
115 }
116 }
117 k += nc * kc;
118 if XNN_UNPREDICTABLE(b != NULL) {
119 b += nc;
120 }
121 } while (--g != 0);
122}
123
Marat Dukhan08b7a972020-07-14 18:17:29 -0700124void xnn_pack_qu8_gemm_goi_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700125 size_t g,
126 size_t nc,
127 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700128 size_t nr,
129 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700130 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700131 const uint8_t* k,
132 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700133 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700134 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700135{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700136 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700137 const int32_t izp = (int32_t) params->input_zero_point;
138 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700139 do {
140 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
141 const size_t nr_block_size = min(nc - nr_block_start, nr);
142 int32_t* packed_b = (int32_t*) packed_w;
143 if XNN_LIKELY(b != NULL) {
144 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
145 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
146 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
147 }
148 } else {
149 size_t n = nr_block_size;
150 do {
151 *((int32_t*) packed_w) = boff;
152 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
153 } while (--n != 0);
154 }
155 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
156 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
157 const size_t kr_block_size = min(kc - kr_block_start, kr);
158 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
159 int32_t ksum = 0;
160 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
161 const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
162 ksum += (int32_t) kv;
163 *((uint8_t*) packed_w) = kv;
164 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
165 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700166 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700167 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
168 }
169 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
170 }
171 }
172 k += nc * kc;
173 if XNN_UNPREDICTABLE(b != NULL) {
174 b += nc;
175 }
176 } while (--g != 0);
177}
178
Marat Dukhan595e1702020-07-31 10:12:52 -0700179void xnn_pack_qs8_gemm_goi_w(
180 size_t g,
181 size_t nc,
182 size_t kc,
183 size_t nr,
184 size_t kr,
185 size_t sr,
186 const int8_t* k,
187 const int32_t* b,
188 void* packed_w,
189 const struct xnn_qs8_packing_params* params)
190{
191 assert(sr == 1);
192 const int32_t izp = (int32_t) params->input_zero_point;
193 do {
194 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
195 const size_t nr_block_size = min(nc - nr_block_start, nr);
196 int32_t* packed_b = (int32_t*) packed_w;
197 if XNN_LIKELY(b != NULL) {
198 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
199 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
200 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
201 }
202 } else {
203 size_t n = nr_block_size;
204 do {
205 *((int32_t*) packed_w) = 0;
206 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
207 } while (--n != 0);
208 }
209 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
210 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
211 const size_t kr_block_size = min(kc - kr_block_start, kr);
212 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
213 int32_t ksum = 0;
214 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
215 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
216 ksum += (int32_t) kv;
217 *((int8_t*) packed_w) = kv;
218 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
219 }
220 packed_b[nr_block_offset] -= ksum * izp;
221 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
222 }
223 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
224 }
225 }
226 k += nc * kc;
227 if XNN_UNPREDICTABLE(b != NULL) {
228 b += nc;
229 }
230 } while (--g != 0);
231}
232
Marat Dukhana6879bd2020-07-06 14:25:08 -0700233void xnn_pack_f32_gemm_io_w(
234 size_t nc,
235 size_t kc,
236 size_t nr,
237 size_t kr,
238 size_t sr,
239 const float* k,
240 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700241 float* packed_w,
242 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700243{
244 const size_t skr = sr * kr;
245 const size_t skc = round_down_po2(kc, skr);
246 const size_t sr_mask = (sr - 1) * kr;
247 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
248 const size_t nr_block_size = min(nc - nr_block_start, nr);
249 if XNN_LIKELY(b != NULL) {
250 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
251 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
252 }
253 }
254 packed_w += nr;
255
256 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
257 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
258 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
259 *packed_w++ =
260 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
261 }
262 }
263 packed_w += (nr - nr_block_size) * kr;
264 }
265
266 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
267 const size_t kr_block_size = min(kc - kr_block_start, kr);
268 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
269 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
270 *packed_w++ =
271 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
272 }
273 packed_w += kr - kr_block_size;
274 }
275 packed_w += (nr - nr_block_size) * kr;
276 }
277 }
278}
279
280void xnn_pack_f16_gemm_io_w(
281 size_t nc,
282 size_t kc,
283 size_t nr,
284 size_t kr,
285 size_t sr,
286 const uint16_t* k,
287 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700288 uint16_t* packed_w,
289 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700290{
291 const size_t skr = sr * kr;
292 const size_t skc = round_down_po2(kc, skr);
293 const size_t sr_mask = (sr - 1) * kr;
294 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
295 const size_t nr_block_size = min(nc - nr_block_start, nr);
296 if XNN_LIKELY(b != NULL) {
297 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
298 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
299 }
300 }
301 packed_w += nr;
302
303 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
304 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
305 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
306 *packed_w++ =
307 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
308 }
309 }
310 packed_w += (nr - nr_block_size) * kr;
311 }
312
313 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
314 const size_t kr_block_size = min(kc - kr_block_start, kr);
315 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
316 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
317 *packed_w++ =
318 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
319 }
320 packed_w += kr - kr_block_size;
321 }
322 packed_w += (nr - nr_block_size) * kr;
323 }
324 }
325}
326
Marat Dukhan08b7a972020-07-14 18:17:29 -0700327void xnn_pack_qu8_gemm_io_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700328 size_t nc,
329 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700330 size_t nr,
331 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700332 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700333 const uint8_t* k,
334 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700335 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700336 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700337{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700338 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700339 const int32_t izp = (int32_t) params->input_zero_point;
340 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700341 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
342 const size_t nr_block_size = min(nc - nr_block_start, nr);
343 int32_t* packed_b = (int32_t*) packed_w;
344 if XNN_LIKELY(b != NULL) {
345 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
346 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
347 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
348 }
349 } else {
350 size_t n = nr_block_size;
351 do {
352 *((int32_t*) packed_w) = boff;
353 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
354 } while (--n != 0);
355 }
356 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
357 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
358 const size_t kr_block_size = min(kc - kr_block_start, kr);
359 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
360 int32_t ksum = 0;
361 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
362 const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
363 ksum += (int32_t) kv;
364 *((uint8_t*) packed_w) = kv;
365 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
366 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700367 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700368 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
369 }
370 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
371 }
372 }
373}
374
Marat Dukhana6879bd2020-07-06 14:25:08 -0700375void xnn_pack_f32_conv_goki_w(
376 size_t g,
377 size_t nc,
378 size_t ks,
379 size_t kc,
380 size_t nr,
381 size_t kr,
382 size_t sr,
383 const float* k,
384 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700385 float* packed_w,
386 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700387{
388 const size_t skr = sr * kr;
389 const size_t skc = round_down_po2(kc, skr);
390 const size_t sr_mask = (sr - 1) * kr;
391 do {
392 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
393 const size_t nr_block_size = min(nc - nr_block_start, nr);
394 if XNN_LIKELY(b != NULL) {
395 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
396 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
397 }
398 }
399 packed_w += nr;
400
401 for (size_t ki = 0; ki < ks; ki++) {
402 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
403 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
404 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
405 *packed_w++ =
406 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
407 }
408 }
409 packed_w += (nr - nr_block_size) * kr;
410 }
411
412 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
413 const size_t kr_block_size = min(kc - kr_block_start, kr);
414 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
415 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
416 *packed_w++ =
417 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
418 }
419 packed_w += kr - kr_block_size;
420 }
421 packed_w += (nr - nr_block_size) * kr;
422 }
423 }
424 }
425 k += ks * kc * nc;
426 if XNN_UNPREDICTABLE(b != NULL) {
427 b += nc;
428 }
429 } while (--g != 0);
430}
431
432void xnn_pack_f16_conv_goki_w(
433 size_t g,
434 size_t nc,
435 size_t ks,
436 size_t kc,
437 size_t nr,
438 size_t kr,
439 size_t sr,
440 const uint16_t* k,
441 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700442 uint16_t* packed_w,
443 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700444{
445 const size_t skr = sr * kr;
446 const size_t skc = round_down_po2(kc, skr);
447 const size_t sr_mask = (sr - 1) * kr;
448 do {
449 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
450 const size_t nr_block_size = min(nc - nr_block_start, nr);
451 if XNN_LIKELY(b != NULL) {
452 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
453 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
454 }
455 }
456 packed_w += nr;
457
458 for (size_t ki = 0; ki < ks; ki++) {
459 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
460 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
461 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
462 *packed_w++ =
463 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
464 }
465 }
466 packed_w += (nr - nr_block_size) * kr;
467 }
468
469 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
470 const size_t kr_block_size = min(kc - kr_block_start, kr);
471 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
472 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
473 *packed_w++ =
474 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
475 }
476 packed_w += kr - kr_block_size;
477 }
478 packed_w += (nr - nr_block_size) * kr;
479 }
480 }
481 }
482 k += ks * kc * nc;
483 if XNN_UNPREDICTABLE(b != NULL) {
484 b += nc;
485 }
486 } while (--g != 0);
487}
488
Marat Dukhan08b7a972020-07-14 18:17:29 -0700489void xnn_pack_qu8_conv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700490 size_t g,
491 size_t nc,
492 size_t ks,
493 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700494 size_t nr,
495 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700496 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700497 const uint8_t* k,
498 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700499 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700500 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700501{
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700502 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700503 const int32_t izp = (int32_t) params->input_zero_point;
504 const int32_t boff = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700505 do {
506 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
507 const size_t nr_block_size = min(nc - nr_block_start, nr);
508 int32_t* packed_b = (int32_t*) packed_w;
509 if XNN_LIKELY(b != NULL) {
510 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
511 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
512 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
513 }
514 } else {
515 size_t n = nr_block_size;
516 do {
517 *((int32_t*) packed_w) = boff;
518 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
519 } while (--n != 0);
520 }
521 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
522 for (size_t ki = 0; ki < ks; ki++) {
523 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
524 const size_t kr_block_size = min(kc - kr_block_start, kr);
525 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
526 int32_t ksum = 0;
527 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
528 const uint8_t kv =
529 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
530 ksum += (int32_t) kv;
531 *((uint8_t*) packed_w) = kv;
532 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
533 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700534 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700535 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
536 }
537 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
538 }
539 }
540 }
541 k += ks * kc * nc;
542 if XNN_UNPREDICTABLE(b != NULL) {
543 b += nc;
544 }
545 } while (--g != 0);
546}
547
Marat Dukhanf9480682020-07-31 14:50:24 -0700548void xnn_pack_qs8_conv_goki_w(
549 size_t g,
550 size_t nc,
551 size_t ks,
552 size_t kc,
553 size_t nr,
554 size_t kr,
555 size_t sr,
556 const int8_t* k,
557 const int32_t* b,
558 void* packed_w,
559 const struct xnn_qs8_packing_params* params)
560{
561 assert(sr == 1);
562 const int32_t izp = (int32_t) params->input_zero_point;
563 do {
564 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
565 const size_t nr_block_size = min(nc - nr_block_start, nr);
566 int32_t* packed_b = (int32_t*) packed_w;
567 if XNN_LIKELY(b != NULL) {
568 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
569 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
570 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
571 }
572 } else {
573 size_t n = nr_block_size;
574 do {
575 *((int32_t*) packed_w) = 0;
576 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
577 } while (--n != 0);
578 }
579 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
580 for (size_t ki = 0; ki < ks; ki++) {
581 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
582 const size_t kr_block_size = min(kc - kr_block_start, kr);
583 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
584 int32_t ksum = 0;
585 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
586 const int8_t kv =
587 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
588 ksum += (int32_t) kv;
589 *((int8_t*) packed_w) = kv;
590 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
591 }
592 packed_b[nr_block_offset] -= ksum * izp;
593 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
594 }
595 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
596 }
597 }
598 }
599 k += ks * kc * nc;
600 if XNN_UNPREDICTABLE(b != NULL) {
601 b += nc;
602 }
603 } while (--g != 0);
604}
605
Marat Dukhana6879bd2020-07-06 14:25:08 -0700606void xnn_pack_f32_conv_kgo_w(
607 size_t g,
608 size_t nc,
609 size_t ks,
610 size_t nr,
611 size_t kr,
612 const float* k,
613 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700614 float* packed_w,
615 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700616{
617 for (size_t i = 0; i < g; i++) {
618 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
619 const size_t nr_block_size = min(nc - nr_block_start, nr);
620 if XNN_LIKELY(b != NULL) {
621 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
622 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
623 }
624 }
625 packed_w += nr;
626 for (size_t ki = 0; ki < ks; ki++) {
627 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
628 *packed_w =
629 k[ki * g * nc + (nr_block_start + nr_block_offset)];
630 packed_w += kr;
631 }
632 packed_w += (nr - nr_block_size) * kr;
633 }
634 }
635 k += nc;
636 if XNN_UNPREDICTABLE(b != NULL) {
637 b += nc;
638 }
639 }
640}
641
642void xnn_pack_f16_conv_kgo_w(
643 size_t g,
644 size_t nc,
645 size_t ks,
646 size_t nr,
647 size_t kr,
648 const uint16_t* k,
649 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700650 uint16_t* packed_w,
651 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700652{
653 for (size_t i = 0; i < g; i++) {
654 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
655 const size_t nr_block_size = min(nc - nr_block_start, nr);
656 if XNN_LIKELY(b != NULL) {
657 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
658 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
659 }
660 }
661 packed_w += nr;
662 for (size_t ki = 0; ki < ks; ki++) {
663 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
664 *packed_w =
665 k[ki * g * nc + (nr_block_start + nr_block_offset)];
666 packed_w += kr;
667 }
668 packed_w += (nr - nr_block_size) * kr;
669 }
670 }
671 k += nc;
672 if XNN_UNPREDICTABLE(b != NULL) {
673 b += nc;
674 }
675 }
676}
677
Marat Dukhan08b7a972020-07-14 18:17:29 -0700678void xnn_pack_qu8_conv_kgo_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700679 size_t g,
680 size_t nc,
681 size_t ks,
Marat Dukhana6879bd2020-07-06 14:25:08 -0700682 size_t nr,
683 size_t kr,
Marat Dukhanab582382020-07-06 13:32:08 -0700684 const uint8_t* k,
685 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700686 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700687 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700688{
Marat Dukhanb42f8662020-07-06 20:46:13 -0700689 const int32_t izp = (int32_t) params->input_zero_point;
690 const int32_t boff = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700691 for (size_t i = 0; i < g; i++) {
692 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
693 const size_t nr_block_size = min(nc - nr_block_start, nr);
694 int32_t* packed_b = (int32_t*) packed_w;
695 if XNN_LIKELY(b != NULL) {
696 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
697 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
698 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
699 }
700 } else {
701 size_t n = nr_block_size;
702 do {
703 *((int32_t*) packed_w) = boff;
704 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
705 } while (--n != 0);
706 }
707 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
708 for (size_t ki = 0; ki < ks; ki++) {
709 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
710 const uint8_t kv =
711 k[ki * g * nc + (nr_block_start + nr_block_offset)];
712 *((uint8_t*) packed_w) = kv;
Marat Dukhanb42f8662020-07-06 20:46:13 -0700713 packed_b[nr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700714 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
715 }
716 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
717 }
718 }
719 k += nc;
720 if XNN_UNPREDICTABLE(b != NULL) {
721 b += nc;
722 }
723 }
724}
725
Marat Dukhana6879bd2020-07-06 14:25:08 -0700726void xnn_pack_f32_deconv_goki_w(
727 size_t g,
728 size_t nc,
729 size_t kh,
730 size_t kw,
731 size_t kc,
732 size_t sh,
733 size_t sw,
734 size_t nr,
735 size_t kr,
736 size_t sr,
737 const float* k,
738 const float* b,
739 float* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700740 struct subconvolution_params* subconv_params,
741 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700742{
743 const size_t skr = sr * kr;
744 const size_t skc = round_down_po2(kc, skr);
745 const size_t sr_mask = (sr - 1) * kr;
746 for (size_t i = 0; i < g; i++) {
747 for (size_t oy = 0; oy < sh; oy++) {
748 for (size_t ox = 0; ox < sw; ox++) {
749 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700750 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700751 }
752 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
753 const size_t nr_block_size = min(nc - nr_block_start, nr);
754 if XNN_LIKELY(b != NULL) {
755 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
756 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
757 }
758 }
759 packed_w += nr;
760 for (size_t ky = oy; ky < kh; ky += sh) {
761 for (size_t kx = ox; kx < kw; kx += sw) {
762 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
763 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
764 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
765 *packed_w++ =
766 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
767 }
768 }
769 packed_w += (nr - nr_block_size) * kr;
770 }
771
772 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
773 const size_t kr_block_size = min(kc - kr_block_start, kr);
774 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
775 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
776 *packed_w++ =
777 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
778 }
779 packed_w += kr - kr_block_size;
780 }
781 packed_w += (nr - nr_block_size) * kr;
782 }
783 }
784 }
785 }
786 }
787 }
788 k += kh * kw * kc * nc;
789 if XNN_UNPREDICTABLE(b != NULL) {
790 b += nc;
791 }
792 }
793}
794
795void xnn_pack_f16_deconv_goki_w(
796 size_t g,
797 size_t nc,
798 size_t kh,
799 size_t kw,
800 size_t kc,
801 size_t sh,
802 size_t sw,
803 size_t nr,
804 size_t kr,
805 size_t sr,
806 const uint16_t* k,
807 const uint16_t* b,
808 uint16_t* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700809 struct subconvolution_params* subconv_params,
810 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700811{
812 const size_t skr = sr * kr;
813 const size_t skc = round_down_po2(kc, skr);
814 const size_t sr_mask = (sr - 1) * kr;
815 for (size_t i = 0; i < g; i++) {
816 for (size_t oy = 0; oy < sh; oy++) {
817 for (size_t ox = 0; ox < sw; ox++) {
818 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700819 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700820 }
821 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
822 const size_t nr_block_size = min(nc - nr_block_start, nr);
823 if XNN_LIKELY(b != NULL) {
824 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
825 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
826 }
827 }
828 packed_w += nr;
829 for (size_t ky = oy; ky < kh; ky += sh) {
830 for (size_t kx = ox; kx < kw; kx += sw) {
831 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
832 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
833 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
834 *packed_w++ =
835 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
836 }
837 }
838 packed_w += (nr - nr_block_size) * kr;
839 }
840
841 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
842 const size_t kr_block_size = min(kc - kr_block_start, kr);
843 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
844 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
845 *packed_w++ =
846 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
847 }
848 packed_w += kr - kr_block_size;
849 }
850 packed_w += (nr - nr_block_size) * kr;
851 }
852 }
853 }
854 }
855 }
856 }
857 k += kh * kw * kc * nc;
858 if XNN_UNPREDICTABLE(b != NULL) {
859 b += nc;
860 }
861 }
862}
863
Marat Dukhan08b7a972020-07-14 18:17:29 -0700864void xnn_pack_qu8_deconv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700865 size_t g,
866 size_t nc,
867 size_t kh,
868 size_t kw,
869 size_t kc,
870 size_t sh,
871 size_t sw,
872 size_t nr,
873 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700874 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700875 const uint8_t* k,
876 const int32_t* b,
877 void* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700878 struct subconvolution_params* subconv_params,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700879 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700880{
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700881 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700882 const int32_t izp = (int32_t) params->input_zero_point;
883 const int32_t kzp = (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700884 for (size_t i = 0; i < g; i++) {
885 for (size_t oy = 0; oy < sh; oy++) {
886 for (size_t ox = 0; ox < sw; ox++) {
887 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700888 (*subconv_params++).weights = packed_w;
Marat Dukhanab582382020-07-06 13:32:08 -0700889 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700890 const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
Marat Dukhanab582382020-07-06 13:32:08 -0700891 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
892 const size_t nr_block_size = min(nc - nr_block_start, nr);
893 int32_t* packed_b = (int32_t*) packed_w;
894 if XNN_LIKELY(b != 0) {
895 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
896 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
897 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
898 }
899 } else {
900 size_t n = nr_block_size;
901 do {
902 *((int32_t*) packed_w) = boff;
903 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
904 } while (--n != 0);
905 }
906 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
907 for (size_t ky = oy; ky < kh; ky += sh) {
908 for (size_t kx = ox; kx < kw; kx += sw) {
909 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
910 const size_t kr_block_size = min(kc - kr_block_start, kr);
911 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
912 int32_t ksum = 0;
913 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
914 const uint8_t kv =
915 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
916 ksum += (int32_t) kv;
917 *((uint8_t*) packed_w) = kv;
918 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
919 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700920 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700921 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
922 }
923 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
924 }
925 }
926 }
927 }
928 }
929 }
930 k += kh * kw * kc * nc;
931 if XNN_UNPREDICTABLE(b != NULL) {
932 b += nc;
933 }
934 }
935}
936
Marat Dukhana6879bd2020-07-06 14:25:08 -0700937void xnn_pack_f32_dwconv_ghw_w(
938 size_t h,
939 size_t w,
940 size_t c,
941 size_t cr,
942 const float* k,
943 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700944 float* packed_w,
945 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700946{
947 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
948 const size_t cr_block_size = min(c - cr_block_start, cr);
949 if XNN_LIKELY(b != NULL) {
950 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
951 *packed_w++ = b[cr_block_start + cr_block_offset];
952 }
953 } else {
954 size_t n = cr_block_size;
955 do {
956 *packed_w++ = 0.0f;
957 } while (--n != 0);
958 }
959 packed_w += cr - cr_block_size;
960 for (size_t x = 0; x < w; x++) {
961 for (size_t y = 0; y < h; y++) {
962 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
963 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
964 *packed_w++ = kv;
965 }
966 packed_w += cr - cr_block_size;
967 }
968 }
969 }
970}
971
972void xnn_pack_f16_dwconv_ghw_w(
973 size_t h,
974 size_t w,
975 size_t c,
976 size_t cr,
977 const uint16_t* k,
978 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700979 uint16_t* packed_w,
980 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700981{
982 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
983 const size_t cr_block_size = min(c - cr_block_start, cr);
984 if XNN_LIKELY(b != NULL) {
985 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
986 *packed_w++ = b[cr_block_start + cr_block_offset];
987 }
988 } else {
989 size_t n = cr_block_size;
990 do {
991 *packed_w++ = 0;
992 } while (--n != 0);
993 }
994 packed_w += cr - cr_block_size;
995 for (size_t x = 0; x < w; x++) {
996 for (size_t y = 0; y < h; y++) {
997 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
998 const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
999 *packed_w++ = kv;
1000 }
1001 packed_w += cr - cr_block_size;
1002 }
1003 }
1004 }
1005}
1006
Marat Dukhan08b7a972020-07-14 18:17:29 -07001007void xnn_pack_qu8_dwconv_ghw_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001008 size_t h,
1009 size_t w,
1010 size_t c,
1011 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001012 const uint8_t* k,
1013 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001014 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001015 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001016{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001017 const int32_t izp = (int32_t) params->input_zero_point;
1018 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001019 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1020 const size_t cr_block_size = min(c - cr_block_start, cr);
1021 int32_t* packed_b = (int32_t*) packed_w;
1022 if XNN_LIKELY(b != NULL) {
1023 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1024 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1025 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1026 }
1027 } else {
1028 size_t n = cr_block_size;
1029 do {
1030 *((int32_t*) packed_w) = boff;
1031 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1032 } while (--n != 0);
1033 }
1034 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1035 for (size_t x = 0; x < w; x++) {
1036 for (size_t y = 0; y < h; y++) {
1037 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1038 const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001039 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001040 *((uint8_t*) packed_w) = kv;
1041 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1042 }
1043 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1044 }
1045 }
1046 }
1047}
1048
Marat Dukhana6879bd2020-07-06 14:25:08 -07001049void xnn_pack_f32_dwconv_hwg_w(
1050 size_t h,
1051 size_t w,
1052 size_t c,
1053 size_t cr,
1054 const float* k,
1055 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001056 float* packed_w,
1057 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001058{
1059 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1060 const size_t cr_block_size = min(c - cr_block_start, cr);
1061 if XNN_LIKELY(b != NULL) {
1062 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1063 *packed_w++ = b[cr_block_start + cr_block_offset];
1064 }
1065 } else {
1066 size_t n = cr_block_size;
1067 do {
1068 *packed_w++ = 0.0f;
1069 } while (--n != 0);
1070 }
1071 packed_w += cr - cr_block_size;
1072 for (size_t x = 0; x < w; x++) {
1073 for (size_t y = 0; y < h; y++) {
1074 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1075 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1076 *packed_w++ = kv;
1077 }
1078 packed_w += cr - cr_block_size;
1079 }
1080 }
1081 }
1082}
1083
1084void xnn_pack_f16_dwconv_hwg_w(
1085 size_t h,
1086 size_t w,
1087 size_t c,
1088 size_t cr,
1089 const uint16_t* k,
1090 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001091 uint16_t* packed_w,
1092 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001093{
1094 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1095 const size_t cr_block_size = min(c - cr_block_start, cr);
1096 if XNN_LIKELY(b != NULL) {
1097 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1098 *packed_w++ = b[cr_block_start + cr_block_offset];
1099 }
1100 } else {
1101 size_t n = cr_block_size;
1102 do {
1103 *packed_w++ = 0;
1104 } while (--n != 0);
1105 }
1106 packed_w += cr - cr_block_size;
1107 for (size_t x = 0; x < w; x++) {
1108 for (size_t y = 0; y < h; y++) {
1109 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1110 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1111 *packed_w++ = kv;
1112 }
1113 packed_w += cr - cr_block_size;
1114 }
1115 }
1116 }
1117}
1118
Marat Dukhan08b7a972020-07-14 18:17:29 -07001119void xnn_pack_qu8_dwconv_hwg_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001120 size_t h,
1121 size_t w,
1122 size_t c,
1123 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001124 const uint8_t* k,
1125 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001126 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001127 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001128{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001129 const int32_t izp = (int32_t) params->input_zero_point;
1130 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001131 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1132 const size_t cr_block_size = min(c - cr_block_start, cr);
1133 int32_t* packed_b = (int32_t*) packed_w;
1134 if XNN_LIKELY(b != NULL) {
1135 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1136 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1137 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1138 }
1139 } else {
1140 size_t n = cr_block_size;
1141 do {
1142 *((int32_t*) packed_w) = boff;
1143 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1144 } while (--n != 0);
1145 }
1146 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1147 for (size_t x = 0; x < w; x++) {
1148 for (size_t y = 0; y < h; y++) {
1149 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1150 const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001151 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001152 *((uint8_t*) packed_w) = kv;
1153 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1154 }
1155 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1156 }
1157 }
1158 }
1159}
1160
// Packs FP32 GEMM weights without a bias row.
//
// k is read as [group][n][c] with extents g x nc x kc. For every group the
// nc rows are cut into nr-wide panels; within a panel the kc dimension is cut
// into kr-wide slices and the slices of the nr rows are interleaved.
//
// The leading part of kc that is a whole number of sr * kr chunks is packed
// with a shuffled read: each row takes its slice from the offset
// (kr_block_start + row * kr) & sr_mask inside the enclosing chunk. The
// remaining elements are copied in order and short slices are padded to kr.
// Padding slots are skipped and keep whatever the caller stored there.
//
// Nothing is written when g is 0.
//
//   g, nc, kc  - extents of k as described above
//   nr, kr, sr - panel width, slice width and shuffle factor of the packed
//                layout (sr * kr is passed to round_down_po2, so it is
//                presumably required to be a power of two)
//   k          - source weights
//   packed_w   - destination buffer
//   params     - unused
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  // Counted loop rather than do/while(--g): a zero group count must not wrap
  // around and walk off the buffers.
  for (size_t group = 0; group < g; group++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      // Shuffled part of kc.
      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      // Remainder of kc, copied in order.
      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    // Advance to the next group's weights.
    k += nc * kc;
  }
}
1204
// Packs 16-bit GEMM weights without a bias row.
//
// k is read as [group][n][c] with extents g x nc x kc. For every group the
// nc rows are cut into nr-wide panels; within a panel the kc dimension is cut
// into kr-wide slices and the slices of the nr rows are interleaved.
//
// The leading part of kc that is a whole number of sr * kr chunks is packed
// with a shuffled read: each row takes its slice from the offset
// (kr_block_start + row * kr) & sr_mask inside the enclosing chunk. The
// remaining elements are copied in order and short slices are padded to kr.
// Padding slots are skipped and keep whatever the caller stored there.
//
// Nothing is written when g is 0.
//
//   g, nc, kc  - extents of k as described above
//   nr, kr, sr - panel width, slice width and shuffle factor of the packed
//                layout (sr * kr is passed to round_down_po2, so it is
//                presumably required to be a power of two)
//   k          - source weights, copied as raw 16-bit words
//   packed_w   - destination buffer
//   params     - unused
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  // Counted loop rather than do/while(--g): a zero group count must not wrap
  // around and walk off the buffers.
  for (size_t group = 0; group < g; group++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      // Shuffled part of kc.
      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      // Remainder of kc, copied in order.
      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    // Advance to the next group's weights.
    k += nc * kc;
  }
}
1248
Marat Dukhana6879bd2020-07-06 14:25:08 -07001249void xnn_pack_f32_dconv_oki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001250 size_t nc,
Marat Dukhanab582382020-07-06 13:32:08 -07001251 size_t kc,
1252 size_t nr,
Marat Dukhana6879bd2020-07-06 14:25:08 -07001253 size_t kh,
1254 size_t kw,
1255 const float* k,
1256 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001257 float* packed_w,
1258 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001259{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001260 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1261 const size_t nr_block_size = min(nc - nr_block_start, nr);
1262 if XNN_LIKELY(b != NULL) {
1263 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1264 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
Marat Dukhanab582382020-07-06 13:32:08 -07001265 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001266 } else {
1267 size_t n = nr;
1268 do {
1269 *packed_w++ = 0.0f;
1270 } while (--n != 0);
1271 }
Marat Dukhanab582382020-07-06 13:32:08 -07001272
Marat Dukhana6879bd2020-07-06 14:25:08 -07001273 for (size_t kx = 0; kx < kw; kx++) {
1274 for (size_t c = 0; c < kc; c++) {
1275 for (size_t ky = 0; ky < kh; ky++) {
1276 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1277 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
Marat Dukhanab582382020-07-06 13:32:08 -07001278 }
Marat Dukhanab582382020-07-06 13:32:08 -07001279 }
1280 }
1281 }
Marat Dukhanab582382020-07-06 13:32:08 -07001282 if XNN_UNPREDICTABLE(b != NULL) {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001283 b += nr;
Marat Dukhanab582382020-07-06 13:32:08 -07001284 }
1285 }
1286}
1287
1288void xnn_pack_f16_dconv_oki_w(
1289 size_t nc,
1290 size_t kc,
1291 size_t nr,
1292 size_t kh,
1293 size_t kw,
1294 const uint16_t* k,
1295 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001296 uint16_t* packed_w,
1297 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001298{
1299 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1300 const size_t nr_block_size = min(nc - nr_block_start, nr);
1301 if XNN_LIKELY(b != NULL) {
1302 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1303 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
1304 }
1305 } else {
1306 size_t n = nr;
1307 do {
1308 *packed_w++ = 0;
1309 } while (--n != 0);
1310 }
1311
1312 for (size_t kx = 0; kx < kw; kx++) {
1313 for (size_t c = 0; c < kc; c++) {
1314 for (size_t ky = 0; ky < kh; ky++) {
1315 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1316 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
1317 }
1318 }
1319 }
1320 }
1321 if XNN_UNPREDICTABLE(b != NULL) {
1322 b += nr;
1323 }
1324 }
1325}
1326
Marat Dukhana6879bd2020-07-06 14:25:08 -07001327void xnn_pack_f32_chw_dwconv_ghw_w(
1328 size_t kernel_size,
1329 size_t groups,
1330 const float* kernel,
1331 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001332 float* packed_weights,
1333 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001334{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001335 for (size_t g = 0; g < groups; g++) {
1336 if XNN_LIKELY(bias != NULL) {
1337 *packed_weights = *bias++;
Marat Dukhanab582382020-07-06 13:32:08 -07001338 } else {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001339 *packed_weights = 0.0f;
Marat Dukhanab582382020-07-06 13:32:08 -07001340 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001341 packed_weights += 1;
1342 for (size_t i = 0; i < kernel_size; i++) {
1343 *packed_weights++ = kernel[g * kernel_size + i];
Marat Dukhanab582382020-07-06 13:32:08 -07001344 }
1345 }
1346}
1347
1348void xnn_pack_f16_chw_dwconv_ghw_w(
1349 size_t kernel_size,
1350 size_t groups,
1351 const uint16_t* kernel,
1352 const uint16_t* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001353 uint16_t* packed_weights,
1354 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001355{
1356 for (size_t g = 0; g < groups; g++) {
1357 if XNN_LIKELY(bias != NULL) {
1358 *packed_weights = *bias++;
1359 } else {
1360 *packed_weights = 0;
1361 }
1362 packed_weights += 1;
1363 for (size_t i = 0; i < kernel_size; i++) {
1364 *packed_weights++ = kernel[g * kernel_size + i];
1365 }
1366 }
1367}
1368
Marat Dukhanab582382020-07-06 13:32:08 -07001369void xnn_pack_f32_chw_dwconv_hwg_w(
1370 size_t kernel_size,
1371 size_t groups,
1372 const float* kernel,
1373 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001374 float* packed_weights,
1375 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001376{
1377 for (size_t g = 0; g < groups; g++) {
1378 if XNN_LIKELY(bias != NULL) {
1379 *packed_weights = *bias++;
1380 } else {
1381 *packed_weights = 0.0f;
1382 }
1383 packed_weights += 1;
1384 for (size_t i = 0; i < kernel_size; i++) {
1385 *packed_weights++ = kernel[i * groups + g];
1386 }
1387 }
1388}
1389
1390void xnn_pack_f32_vmulcaddc_w(
1391 size_t c,
1392 size_t cr,
1393 const float* s,
1394 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001395 float* packed_w,
1396 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001397{
1398 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1399 const size_t cr_block_size = min(c - cr_block_start, cr);
1400 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1401 *packed_w++ = s[cr_block_start + cr_block_offset];
1402 }
1403 packed_w += cr - cr_block_size;
1404 if XNN_LIKELY(b != NULL) {
1405 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1406 *packed_w++ = b[cr_block_start + cr_block_offset];
1407 }
1408 } else {
1409 size_t n = cr_block_size;
1410 do {
1411 *packed_w++ = 0.0f;
1412 } while (--n != 0);
1413 }
1414 packed_w += cr - cr_block_size;
1415 }
1416}
1417
1418void xnn_pack_f16_vmulcaddc_w(
1419 size_t c,
1420 size_t cr,
1421 const uint16_t* s,
1422 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001423 uint16_t* packed_w,
1424 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001425{
1426 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1427 const size_t cr_block_size = min(c - cr_block_start, cr);
1428 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1429 *packed_w++ = s[cr_block_start + cr_block_offset];
1430 }
1431 packed_w += cr - cr_block_size;
1432 if XNN_LIKELY(b != NULL) {
1433 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1434 *packed_w++ = b[cr_block_start + cr_block_offset];
1435 }
1436 } else {
1437 size_t n = cr_block_size;
1438 do {
1439 *packed_w++ = 0;
1440 } while (--n != 0);
1441 }
1442 packed_w += cr - cr_block_size;
1443 }
1444}