// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8
9#include <stdint.h>
10#include <stddef.h>
11
12#include <xnnpack/math.h>
13#include <xnnpack/pack.h>
14
15
Marat Dukhana6879bd2020-07-06 14:25:08 -070016void xnn_pack_f32_gemm_goi_w(
17 size_t g,
18 size_t nc,
19 size_t kc,
20 size_t nr,
21 size_t kr,
22 size_t sr,
23 const float* k,
24 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070025 float* packed_w,
26 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070027{
28 const size_t skr = sr * kr;
29 const size_t skc = round_down_po2(kc, skr);
30 const size_t sr_mask = (sr - 1) * kr;
31 do {
32 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
33 const size_t nr_block_size = min(nc - nr_block_start, nr);
34 if XNN_LIKELY(b != NULL) {
35 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
36 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
37 }
38 }
39 packed_w += nr;
40
41 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
42 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
43 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
44 *packed_w++ =
45 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
46 }
47 }
48 packed_w += (nr - nr_block_size) * kr;
49 }
50
51 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
52 const size_t kr_block_size = min(kc - kr_block_start, kr);
53 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
54 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
55 *packed_w++ =
56 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
57 }
58 packed_w += kr - kr_block_size;
59 }
60 packed_w += (nr - nr_block_size) * kr;
61 }
62 }
63 k += nc * kc;
64 if XNN_UNPREDICTABLE(b != NULL) {
65 b += nc;
66 }
67 } while (--g != 0);
68}
69
70void xnn_pack_f16_gemm_goi_w(
71 size_t g,
72 size_t nc,
73 size_t kc,
74 size_t nr,
75 size_t kr,
76 size_t sr,
77 const uint16_t* k,
78 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070079 uint16_t* packed_w,
80 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070081{
82 const size_t skr = sr * kr;
83 const size_t skc = round_down_po2(kc, skr);
84 const size_t sr_mask = (sr - 1) * kr;
85 do {
86 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
87 const size_t nr_block_size = min(nc - nr_block_start, nr);
88 if XNN_LIKELY(b != NULL) {
89 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
90 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
91 }
92 }
93 packed_w += nr;
94
95 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
96 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
97 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
98 *packed_w++ =
99 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
100 }
101 }
102 packed_w += (nr - nr_block_size) * kr;
103 }
104
105 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
106 const size_t kr_block_size = min(kc - kr_block_start, kr);
107 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
108 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
109 *packed_w++ =
110 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
111 }
112 packed_w += kr - kr_block_size;
113 }
114 packed_w += (nr - nr_block_size) * kr;
115 }
116 }
117 k += nc * kc;
118 if XNN_UNPREDICTABLE(b != NULL) {
119 b += nc;
120 }
121 } while (--g != 0);
122}
123
Marat Dukhan08b7a972020-07-14 18:17:29 -0700124void xnn_pack_qu8_gemm_goi_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700125 size_t g,
126 size_t nc,
127 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700128 size_t nr,
129 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700130 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700131 const uint8_t* k,
132 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700133 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700134 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700135{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700136 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700137 const int32_t izp = (int32_t) params->input_zero_point;
138 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700139 do {
140 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
141 const size_t nr_block_size = min(nc - nr_block_start, nr);
142 int32_t* packed_b = (int32_t*) packed_w;
143 if XNN_LIKELY(b != NULL) {
144 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
145 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
146 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
147 }
148 } else {
149 size_t n = nr_block_size;
150 do {
151 *((int32_t*) packed_w) = boff;
152 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
153 } while (--n != 0);
154 }
155 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
156 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
157 const size_t kr_block_size = min(kc - kr_block_start, kr);
158 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
159 int32_t ksum = 0;
160 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
161 const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
162 ksum += (int32_t) kv;
163 *((uint8_t*) packed_w) = kv;
164 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
165 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700166 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700167 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
168 }
169 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
170 }
171 }
172 k += nc * kc;
173 if XNN_UNPREDICTABLE(b != NULL) {
174 b += nc;
175 }
176 } while (--g != 0);
177}
178
Marat Dukhan595e1702020-07-31 10:12:52 -0700179void xnn_pack_qs8_gemm_goi_w(
180 size_t g,
181 size_t nc,
182 size_t kc,
183 size_t nr,
184 size_t kr,
185 size_t sr,
186 const int8_t* k,
187 const int32_t* b,
188 void* packed_w,
189 const struct xnn_qs8_packing_params* params)
190{
191 assert(sr == 1);
192 const int32_t izp = (int32_t) params->input_zero_point;
193 do {
194 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
195 const size_t nr_block_size = min(nc - nr_block_start, nr);
196 int32_t* packed_b = (int32_t*) packed_w;
197 if XNN_LIKELY(b != NULL) {
198 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
199 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
200 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
201 }
202 } else {
203 size_t n = nr_block_size;
204 do {
205 *((int32_t*) packed_w) = 0;
206 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
207 } while (--n != 0);
208 }
209 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
210 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
211 const size_t kr_block_size = min(kc - kr_block_start, kr);
212 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
213 int32_t ksum = 0;
214 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
215 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
216 ksum += (int32_t) kv;
217 *((int8_t*) packed_w) = kv;
218 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
219 }
220 packed_b[nr_block_offset] -= ksum * izp;
221 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
222 }
223 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
224 }
225 }
226 k += nc * kc;
227 if XNN_UNPREDICTABLE(b != NULL) {
228 b += nc;
229 }
230 } while (--g != 0);
231}
232
Marat Dukhan683fab32020-08-03 19:42:52 -0700233void xnn_pack_qs8_gemm_xw_goi_w(
234 size_t g,
235 size_t nc,
236 size_t kc,
237 size_t nr,
238 size_t kr,
239 size_t sr,
240 const int8_t* k,
241 const int32_t* b,
242 void* packed_w,
243 const struct xnn_qs8_packing_params* params)
244{
245 assert(sr == 1);
246 const int32_t izp = (int32_t) params->input_zero_point;
247 do {
248 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
249 const size_t nr_block_size = min(nc - nr_block_start, nr);
250 int32_t* packed_b = (int32_t*) packed_w;
251 if XNN_LIKELY(b != NULL) {
252 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
253 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
254 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
255 }
256 } else {
257 size_t n = nr_block_size;
258 do {
259 *((int32_t*) packed_w) = 0;
260 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
261 } while (--n != 0);
262 }
263 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
264 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
265 const size_t kr_block_size = min(kc - kr_block_start, kr);
266 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
267 int32_t ksum = 0;
268 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
269 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
270 ksum += (int32_t) kv;
271 *((int16_t*) packed_w) = (int16_t) kv;
272 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int16_t));
273 }
274 packed_b[nr_block_offset] -= ksum * izp;
275 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int16_t));
276 }
277 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int16_t));
278 }
279 }
280 k += nc * kc;
281 if XNN_UNPREDICTABLE(b != NULL) {
282 b += nc;
283 }
284 } while (--g != 0);
285}
286
Marat Dukhana6879bd2020-07-06 14:25:08 -0700287void xnn_pack_f32_gemm_io_w(
288 size_t nc,
289 size_t kc,
290 size_t nr,
291 size_t kr,
292 size_t sr,
293 const float* k,
294 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700295 float* packed_w,
296 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700297{
298 const size_t skr = sr * kr;
299 const size_t skc = round_down_po2(kc, skr);
300 const size_t sr_mask = (sr - 1) * kr;
301 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
302 const size_t nr_block_size = min(nc - nr_block_start, nr);
303 if XNN_LIKELY(b != NULL) {
304 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
305 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
306 }
307 }
308 packed_w += nr;
309
310 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
311 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
312 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
313 *packed_w++ =
314 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
315 }
316 }
317 packed_w += (nr - nr_block_size) * kr;
318 }
319
320 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
321 const size_t kr_block_size = min(kc - kr_block_start, kr);
322 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
323 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
324 *packed_w++ =
325 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
326 }
327 packed_w += kr - kr_block_size;
328 }
329 packed_w += (nr - nr_block_size) * kr;
330 }
331 }
332}
333
334void xnn_pack_f16_gemm_io_w(
335 size_t nc,
336 size_t kc,
337 size_t nr,
338 size_t kr,
339 size_t sr,
340 const uint16_t* k,
341 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700342 uint16_t* packed_w,
343 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700344{
345 const size_t skr = sr * kr;
346 const size_t skc = round_down_po2(kc, skr);
347 const size_t sr_mask = (sr - 1) * kr;
348 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
349 const size_t nr_block_size = min(nc - nr_block_start, nr);
350 if XNN_LIKELY(b != NULL) {
351 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
352 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
353 }
354 }
355 packed_w += nr;
356
357 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
358 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
359 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
360 *packed_w++ =
361 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
362 }
363 }
364 packed_w += (nr - nr_block_size) * kr;
365 }
366
367 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
368 const size_t kr_block_size = min(kc - kr_block_start, kr);
369 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
370 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
371 *packed_w++ =
372 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
373 }
374 packed_w += kr - kr_block_size;
375 }
376 packed_w += (nr - nr_block_size) * kr;
377 }
378 }
379}
380
Marat Dukhan08b7a972020-07-14 18:17:29 -0700381void xnn_pack_qu8_gemm_io_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700382 size_t nc,
383 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700384 size_t nr,
385 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700386 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700387 const uint8_t* k,
388 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700389 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700390 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700391{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700392 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700393 const int32_t izp = (int32_t) params->input_zero_point;
394 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700395 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
396 const size_t nr_block_size = min(nc - nr_block_start, nr);
397 int32_t* packed_b = (int32_t*) packed_w;
398 if XNN_LIKELY(b != NULL) {
399 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
400 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
401 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
402 }
403 } else {
404 size_t n = nr_block_size;
405 do {
406 *((int32_t*) packed_w) = boff;
407 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
408 } while (--n != 0);
409 }
410 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
411 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
412 const size_t kr_block_size = min(kc - kr_block_start, kr);
413 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
414 int32_t ksum = 0;
415 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
416 const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
417 ksum += (int32_t) kv;
418 *((uint8_t*) packed_w) = kv;
419 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
420 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700421 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700422 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
423 }
424 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
425 }
426 }
427}
428
Marat Dukhana6879bd2020-07-06 14:25:08 -0700429void xnn_pack_f32_conv_goki_w(
430 size_t g,
431 size_t nc,
432 size_t ks,
433 size_t kc,
434 size_t nr,
435 size_t kr,
436 size_t sr,
437 const float* k,
438 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700439 float* packed_w,
440 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700441{
442 const size_t skr = sr * kr;
443 const size_t skc = round_down_po2(kc, skr);
444 const size_t sr_mask = (sr - 1) * kr;
445 do {
446 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
447 const size_t nr_block_size = min(nc - nr_block_start, nr);
448 if XNN_LIKELY(b != NULL) {
449 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
450 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
451 }
452 }
453 packed_w += nr;
454
455 for (size_t ki = 0; ki < ks; ki++) {
456 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
457 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
458 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
459 *packed_w++ =
460 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
461 }
462 }
463 packed_w += (nr - nr_block_size) * kr;
464 }
465
466 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
467 const size_t kr_block_size = min(kc - kr_block_start, kr);
468 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
469 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
470 *packed_w++ =
471 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
472 }
473 packed_w += kr - kr_block_size;
474 }
475 packed_w += (nr - nr_block_size) * kr;
476 }
477 }
478 }
479 k += ks * kc * nc;
480 if XNN_UNPREDICTABLE(b != NULL) {
481 b += nc;
482 }
483 } while (--g != 0);
484}
485
486void xnn_pack_f16_conv_goki_w(
487 size_t g,
488 size_t nc,
489 size_t ks,
490 size_t kc,
491 size_t nr,
492 size_t kr,
493 size_t sr,
494 const uint16_t* k,
495 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700496 uint16_t* packed_w,
497 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700498{
499 const size_t skr = sr * kr;
500 const size_t skc = round_down_po2(kc, skr);
501 const size_t sr_mask = (sr - 1) * kr;
502 do {
503 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
504 const size_t nr_block_size = min(nc - nr_block_start, nr);
505 if XNN_LIKELY(b != NULL) {
506 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
507 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
508 }
509 }
510 packed_w += nr;
511
512 for (size_t ki = 0; ki < ks; ki++) {
513 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
514 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
515 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
516 *packed_w++ =
517 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
518 }
519 }
520 packed_w += (nr - nr_block_size) * kr;
521 }
522
523 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
524 const size_t kr_block_size = min(kc - kr_block_start, kr);
525 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
526 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
527 *packed_w++ =
528 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
529 }
530 packed_w += kr - kr_block_size;
531 }
532 packed_w += (nr - nr_block_size) * kr;
533 }
534 }
535 }
536 k += ks * kc * nc;
537 if XNN_UNPREDICTABLE(b != NULL) {
538 b += nc;
539 }
540 } while (--g != 0);
541}
542
Marat Dukhan08b7a972020-07-14 18:17:29 -0700543void xnn_pack_qu8_conv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700544 size_t g,
545 size_t nc,
546 size_t ks,
547 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700548 size_t nr,
549 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700550 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700551 const uint8_t* k,
552 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700553 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700554 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700555{
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700556 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700557 const int32_t izp = (int32_t) params->input_zero_point;
558 const int32_t boff = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700559 do {
560 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
561 const size_t nr_block_size = min(nc - nr_block_start, nr);
562 int32_t* packed_b = (int32_t*) packed_w;
563 if XNN_LIKELY(b != NULL) {
564 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
565 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
566 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
567 }
568 } else {
569 size_t n = nr_block_size;
570 do {
571 *((int32_t*) packed_w) = boff;
572 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
573 } while (--n != 0);
574 }
575 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
576 for (size_t ki = 0; ki < ks; ki++) {
577 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
578 const size_t kr_block_size = min(kc - kr_block_start, kr);
579 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
580 int32_t ksum = 0;
581 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
582 const uint8_t kv =
583 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
584 ksum += (int32_t) kv;
585 *((uint8_t*) packed_w) = kv;
586 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
587 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700588 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700589 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
590 }
591 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
592 }
593 }
594 }
595 k += ks * kc * nc;
596 if XNN_UNPREDICTABLE(b != NULL) {
597 b += nc;
598 }
599 } while (--g != 0);
600}
601
Marat Dukhanf9480682020-07-31 14:50:24 -0700602void xnn_pack_qs8_conv_goki_w(
603 size_t g,
604 size_t nc,
605 size_t ks,
606 size_t kc,
607 size_t nr,
608 size_t kr,
609 size_t sr,
610 const int8_t* k,
611 const int32_t* b,
612 void* packed_w,
613 const struct xnn_qs8_packing_params* params)
614{
615 assert(sr == 1);
616 const int32_t izp = (int32_t) params->input_zero_point;
617 do {
618 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
619 const size_t nr_block_size = min(nc - nr_block_start, nr);
620 int32_t* packed_b = (int32_t*) packed_w;
621 if XNN_LIKELY(b != NULL) {
622 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
623 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
624 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
625 }
626 } else {
627 size_t n = nr_block_size;
628 do {
629 *((int32_t*) packed_w) = 0;
630 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
631 } while (--n != 0);
632 }
633 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
634 for (size_t ki = 0; ki < ks; ki++) {
635 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
636 const size_t kr_block_size = min(kc - kr_block_start, kr);
637 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
638 int32_t ksum = 0;
639 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
640 const int8_t kv =
641 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
642 ksum += (int32_t) kv;
643 *((int8_t*) packed_w) = kv;
644 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
645 }
646 packed_b[nr_block_offset] -= ksum * izp;
647 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
648 }
649 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
650 }
651 }
652 }
653 k += ks * kc * nc;
654 if XNN_UNPREDICTABLE(b != NULL) {
655 b += nc;
656 }
657 } while (--g != 0);
658}
659
Marat Dukhana6879bd2020-07-06 14:25:08 -0700660void xnn_pack_f32_conv_kgo_w(
661 size_t g,
662 size_t nc,
663 size_t ks,
664 size_t nr,
665 size_t kr,
666 const float* k,
667 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700668 float* packed_w,
669 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700670{
671 for (size_t i = 0; i < g; i++) {
672 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
673 const size_t nr_block_size = min(nc - nr_block_start, nr);
674 if XNN_LIKELY(b != NULL) {
675 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
676 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
677 }
678 }
679 packed_w += nr;
680 for (size_t ki = 0; ki < ks; ki++) {
681 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
682 *packed_w =
683 k[ki * g * nc + (nr_block_start + nr_block_offset)];
684 packed_w += kr;
685 }
686 packed_w += (nr - nr_block_size) * kr;
687 }
688 }
689 k += nc;
690 if XNN_UNPREDICTABLE(b != NULL) {
691 b += nc;
692 }
693 }
694}
695
696void xnn_pack_f16_conv_kgo_w(
697 size_t g,
698 size_t nc,
699 size_t ks,
700 size_t nr,
701 size_t kr,
702 const uint16_t* k,
703 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700704 uint16_t* packed_w,
705 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700706{
707 for (size_t i = 0; i < g; i++) {
708 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
709 const size_t nr_block_size = min(nc - nr_block_start, nr);
710 if XNN_LIKELY(b != NULL) {
711 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
712 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
713 }
714 }
715 packed_w += nr;
716 for (size_t ki = 0; ki < ks; ki++) {
717 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
718 *packed_w =
719 k[ki * g * nc + (nr_block_start + nr_block_offset)];
720 packed_w += kr;
721 }
722 packed_w += (nr - nr_block_size) * kr;
723 }
724 }
725 k += nc;
726 if XNN_UNPREDICTABLE(b != NULL) {
727 b += nc;
728 }
729 }
730}
731
Marat Dukhan08b7a972020-07-14 18:17:29 -0700732void xnn_pack_qu8_conv_kgo_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700733 size_t g,
734 size_t nc,
735 size_t ks,
Marat Dukhana6879bd2020-07-06 14:25:08 -0700736 size_t nr,
737 size_t kr,
Marat Dukhanab582382020-07-06 13:32:08 -0700738 const uint8_t* k,
739 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700740 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700741 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700742{
Marat Dukhanb42f8662020-07-06 20:46:13 -0700743 const int32_t izp = (int32_t) params->input_zero_point;
744 const int32_t boff = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700745 for (size_t i = 0; i < g; i++) {
746 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
747 const size_t nr_block_size = min(nc - nr_block_start, nr);
748 int32_t* packed_b = (int32_t*) packed_w;
749 if XNN_LIKELY(b != NULL) {
750 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
751 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
752 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
753 }
754 } else {
755 size_t n = nr_block_size;
756 do {
757 *((int32_t*) packed_w) = boff;
758 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
759 } while (--n != 0);
760 }
761 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
762 for (size_t ki = 0; ki < ks; ki++) {
763 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
764 const uint8_t kv =
765 k[ki * g * nc + (nr_block_start + nr_block_offset)];
766 *((uint8_t*) packed_w) = kv;
Marat Dukhanb42f8662020-07-06 20:46:13 -0700767 packed_b[nr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700768 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
769 }
770 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
771 }
772 }
773 k += nc;
774 if XNN_UNPREDICTABLE(b != NULL) {
775 b += nc;
776 }
777 }
778}
779
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700780void xnn_pack_qs8_conv_kgo_w(
781 size_t g,
782 size_t nc,
783 size_t ks,
784 size_t nr,
785 size_t kr,
786 const int8_t* k,
787 const int32_t* b,
788 void* packed_w,
789 const struct xnn_qs8_packing_params* params)
790{
791 const int32_t izp = (int32_t) params->input_zero_point;
792 for (size_t i = 0; i < g; i++) {
793 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
794 const size_t nr_block_size = min(nc - nr_block_start, nr);
795 int32_t* packed_b = (int32_t*) packed_w;
796 if XNN_LIKELY(b != NULL) {
797 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
798 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
799 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
800 }
801 } else {
802 size_t n = nr_block_size;
803 do {
804 *((int32_t*) packed_w) = 0;
805 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
806 } while (--n != 0);
807 }
808 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
809 for (size_t ki = 0; ki < ks; ki++) {
810 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
811 const int8_t kv =
812 k[ki * g * nc + (nr_block_start + nr_block_offset)];
813 *((int8_t*) packed_w) = kv;
814 packed_b[nr_block_offset] -= (int32_t) kv * izp;
815 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(int8_t));
816 }
817 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
818 }
819 }
820 k += nc;
821 if XNN_UNPREDICTABLE(b != NULL) {
822 b += nc;
823 }
824 }
825}
826
Marat Dukhana6879bd2020-07-06 14:25:08 -0700827void xnn_pack_f32_deconv_goki_w(
828 size_t g,
829 size_t nc,
830 size_t kh,
831 size_t kw,
832 size_t kc,
833 size_t sh,
834 size_t sw,
835 size_t nr,
836 size_t kr,
837 size_t sr,
838 const float* k,
839 const float* b,
840 float* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700841 struct subconvolution_params* subconv_params,
842 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700843{
844 const size_t skr = sr * kr;
845 const size_t skc = round_down_po2(kc, skr);
846 const size_t sr_mask = (sr - 1) * kr;
847 for (size_t i = 0; i < g; i++) {
848 for (size_t oy = 0; oy < sh; oy++) {
849 for (size_t ox = 0; ox < sw; ox++) {
850 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700851 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700852 }
853 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
854 const size_t nr_block_size = min(nc - nr_block_start, nr);
855 if XNN_LIKELY(b != NULL) {
856 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
857 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
858 }
859 }
860 packed_w += nr;
861 for (size_t ky = oy; ky < kh; ky += sh) {
862 for (size_t kx = ox; kx < kw; kx += sw) {
863 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
864 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
865 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
866 *packed_w++ =
867 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
868 }
869 }
870 packed_w += (nr - nr_block_size) * kr;
871 }
872
873 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
874 const size_t kr_block_size = min(kc - kr_block_start, kr);
875 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
876 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
877 *packed_w++ =
878 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
879 }
880 packed_w += kr - kr_block_size;
881 }
882 packed_w += (nr - nr_block_size) * kr;
883 }
884 }
885 }
886 }
887 }
888 }
889 k += kh * kw * kc * nc;
890 if XNN_UNPREDICTABLE(b != NULL) {
891 b += nc;
892 }
893 }
894}
895
896void xnn_pack_f16_deconv_goki_w(
897 size_t g,
898 size_t nc,
899 size_t kh,
900 size_t kw,
901 size_t kc,
902 size_t sh,
903 size_t sw,
904 size_t nr,
905 size_t kr,
906 size_t sr,
907 const uint16_t* k,
908 const uint16_t* b,
909 uint16_t* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700910 struct subconvolution_params* subconv_params,
911 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700912{
913 const size_t skr = sr * kr;
914 const size_t skc = round_down_po2(kc, skr);
915 const size_t sr_mask = (sr - 1) * kr;
916 for (size_t i = 0; i < g; i++) {
917 for (size_t oy = 0; oy < sh; oy++) {
918 for (size_t ox = 0; ox < sw; ox++) {
919 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700920 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700921 }
922 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
923 const size_t nr_block_size = min(nc - nr_block_start, nr);
924 if XNN_LIKELY(b != NULL) {
925 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
926 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
927 }
928 }
929 packed_w += nr;
930 for (size_t ky = oy; ky < kh; ky += sh) {
931 for (size_t kx = ox; kx < kw; kx += sw) {
932 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
933 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
934 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
935 *packed_w++ =
936 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
937 }
938 }
939 packed_w += (nr - nr_block_size) * kr;
940 }
941
942 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
943 const size_t kr_block_size = min(kc - kr_block_start, kr);
944 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
945 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
946 *packed_w++ =
947 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
948 }
949 packed_w += kr - kr_block_size;
950 }
951 packed_w += (nr - nr_block_size) * kr;
952 }
953 }
954 }
955 }
956 }
957 }
958 k += kh * kw * kc * nc;
959 if XNN_UNPREDICTABLE(b != NULL) {
960 b += nc;
961 }
962 }
963}
964
Marat Dukhan08b7a972020-07-14 18:17:29 -0700965void xnn_pack_qu8_deconv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700966 size_t g,
967 size_t nc,
968 size_t kh,
969 size_t kw,
970 size_t kc,
971 size_t sh,
972 size_t sw,
973 size_t nr,
974 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700975 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700976 const uint8_t* k,
977 const int32_t* b,
978 void* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700979 struct subconvolution_params* subconv_params,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700980 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700981{
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700982 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700983 const int32_t izp = (int32_t) params->input_zero_point;
984 const int32_t kzp = (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700985 for (size_t i = 0; i < g; i++) {
986 for (size_t oy = 0; oy < sh; oy++) {
987 for (size_t ox = 0; ox < sw; ox++) {
988 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700989 (*subconv_params++).weights = packed_w;
Marat Dukhanab582382020-07-06 13:32:08 -0700990 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700991 const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
Marat Dukhanab582382020-07-06 13:32:08 -0700992 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
993 const size_t nr_block_size = min(nc - nr_block_start, nr);
994 int32_t* packed_b = (int32_t*) packed_w;
995 if XNN_LIKELY(b != 0) {
996 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
997 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
998 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
999 }
1000 } else {
1001 size_t n = nr_block_size;
1002 do {
1003 *((int32_t*) packed_w) = boff;
1004 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1005 } while (--n != 0);
1006 }
1007 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1008 for (size_t ky = oy; ky < kh; ky += sh) {
1009 for (size_t kx = ox; kx < kw; kx += sw) {
1010 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
1011 const size_t kr_block_size = min(kc - kr_block_start, kr);
1012 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1013 int32_t ksum = 0;
1014 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1015 const uint8_t kv =
1016 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
1017 ksum += (int32_t) kv;
1018 *((uint8_t*) packed_w) = kv;
1019 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1020 }
Marat Dukhanb42f8662020-07-06 20:46:13 -07001021 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001022 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
1023 }
1024 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
1025 }
1026 }
1027 }
1028 }
1029 }
1030 }
1031 k += kh * kw * kc * nc;
1032 if XNN_UNPREDICTABLE(b != NULL) {
1033 b += nc;
1034 }
1035 }
1036}
1037
Marat Dukhana6879bd2020-07-06 14:25:08 -07001038void xnn_pack_f32_dwconv_ghw_w(
1039 size_t h,
1040 size_t w,
1041 size_t c,
1042 size_t cr,
1043 const float* k,
1044 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001045 float* packed_w,
1046 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001047{
1048 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1049 const size_t cr_block_size = min(c - cr_block_start, cr);
1050 if XNN_LIKELY(b != NULL) {
1051 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1052 *packed_w++ = b[cr_block_start + cr_block_offset];
1053 }
1054 } else {
1055 size_t n = cr_block_size;
1056 do {
1057 *packed_w++ = 0.0f;
1058 } while (--n != 0);
1059 }
1060 packed_w += cr - cr_block_size;
1061 for (size_t x = 0; x < w; x++) {
1062 for (size_t y = 0; y < h; y++) {
1063 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1064 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1065 *packed_w++ = kv;
1066 }
1067 packed_w += cr - cr_block_size;
1068 }
1069 }
1070 }
1071}
1072
1073void xnn_pack_f16_dwconv_ghw_w(
1074 size_t h,
1075 size_t w,
1076 size_t c,
1077 size_t cr,
1078 const uint16_t* k,
1079 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001080 uint16_t* packed_w,
1081 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001082{
1083 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1084 const size_t cr_block_size = min(c - cr_block_start, cr);
1085 if XNN_LIKELY(b != NULL) {
1086 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1087 *packed_w++ = b[cr_block_start + cr_block_offset];
1088 }
1089 } else {
1090 size_t n = cr_block_size;
1091 do {
1092 *packed_w++ = 0;
1093 } while (--n != 0);
1094 }
1095 packed_w += cr - cr_block_size;
1096 for (size_t x = 0; x < w; x++) {
1097 for (size_t y = 0; y < h; y++) {
1098 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1099 const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1100 *packed_w++ = kv;
1101 }
1102 packed_w += cr - cr_block_size;
1103 }
1104 }
1105 }
1106}
1107
Marat Dukhan08b7a972020-07-14 18:17:29 -07001108void xnn_pack_qu8_dwconv_ghw_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001109 size_t h,
1110 size_t w,
1111 size_t c,
1112 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001113 const uint8_t* k,
1114 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001115 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001116 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001117{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001118 const int32_t izp = (int32_t) params->input_zero_point;
1119 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001120 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1121 const size_t cr_block_size = min(c - cr_block_start, cr);
1122 int32_t* packed_b = (int32_t*) packed_w;
1123 if XNN_LIKELY(b != NULL) {
1124 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1125 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1126 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1127 }
1128 } else {
1129 size_t n = cr_block_size;
1130 do {
1131 *((int32_t*) packed_w) = boff;
1132 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1133 } while (--n != 0);
1134 }
1135 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1136 for (size_t x = 0; x < w; x++) {
1137 for (size_t y = 0; y < h; y++) {
1138 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1139 const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001140 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001141 *((uint8_t*) packed_w) = kv;
1142 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1143 }
1144 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1145 }
1146 }
1147 }
1148}
1149
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001150void xnn_pack_qs8_dwconv_ghw_w(
1151 size_t h,
1152 size_t w,
1153 size_t c,
1154 size_t cr,
1155 const int8_t* k,
1156 const int32_t* b,
1157 void* packed_w,
1158 const struct xnn_qs8_packing_params* params)
1159{
1160 const int32_t izp = (int32_t) params->input_zero_point;
1161 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1162 const size_t cr_block_size = min(c - cr_block_start, cr);
1163 int32_t* packed_b = (int32_t*) packed_w;
1164 if XNN_LIKELY(b != NULL) {
1165 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1166 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1167 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1168 }
1169 } else {
1170 size_t n = cr_block_size;
1171 do {
1172 *((int32_t*) packed_w) = 0;
1173 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1174 } while (--n != 0);
1175 }
1176 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1177 for (size_t x = 0; x < w; x++) {
1178 for (size_t y = 0; y < h; y++) {
1179 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1180 const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1181 packed_b[cr_block_offset] -= (int32_t) kv * izp;
1182 *((int8_t*) packed_w) = kv;
1183 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1184 }
1185 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1186 }
1187 }
1188 }
1189}
1190
Marat Dukhana6879bd2020-07-06 14:25:08 -07001191void xnn_pack_f32_dwconv_hwg_w(
1192 size_t h,
1193 size_t w,
1194 size_t c,
1195 size_t cr,
1196 const float* k,
1197 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001198 float* packed_w,
1199 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001200{
1201 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1202 const size_t cr_block_size = min(c - cr_block_start, cr);
1203 if XNN_LIKELY(b != NULL) {
1204 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1205 *packed_w++ = b[cr_block_start + cr_block_offset];
1206 }
1207 } else {
1208 size_t n = cr_block_size;
1209 do {
1210 *packed_w++ = 0.0f;
1211 } while (--n != 0);
1212 }
1213 packed_w += cr - cr_block_size;
1214 for (size_t x = 0; x < w; x++) {
1215 for (size_t y = 0; y < h; y++) {
1216 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1217 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1218 *packed_w++ = kv;
1219 }
1220 packed_w += cr - cr_block_size;
1221 }
1222 }
1223 }
1224}
1225
1226void xnn_pack_f16_dwconv_hwg_w(
1227 size_t h,
1228 size_t w,
1229 size_t c,
1230 size_t cr,
1231 const uint16_t* k,
1232 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001233 uint16_t* packed_w,
1234 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001235{
1236 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1237 const size_t cr_block_size = min(c - cr_block_start, cr);
1238 if XNN_LIKELY(b != NULL) {
1239 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1240 *packed_w++ = b[cr_block_start + cr_block_offset];
1241 }
1242 } else {
1243 size_t n = cr_block_size;
1244 do {
1245 *packed_w++ = 0;
1246 } while (--n != 0);
1247 }
1248 packed_w += cr - cr_block_size;
1249 for (size_t x = 0; x < w; x++) {
1250 for (size_t y = 0; y < h; y++) {
1251 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1252 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1253 *packed_w++ = kv;
1254 }
1255 packed_w += cr - cr_block_size;
1256 }
1257 }
1258 }
1259}
1260
Marat Dukhan08b7a972020-07-14 18:17:29 -07001261void xnn_pack_qu8_dwconv_hwg_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001262 size_t h,
1263 size_t w,
1264 size_t c,
1265 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001266 const uint8_t* k,
1267 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001268 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001269 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001270{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001271 const int32_t izp = (int32_t) params->input_zero_point;
1272 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001273 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1274 const size_t cr_block_size = min(c - cr_block_start, cr);
1275 int32_t* packed_b = (int32_t*) packed_w;
1276 if XNN_LIKELY(b != NULL) {
1277 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1278 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1279 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1280 }
1281 } else {
1282 size_t n = cr_block_size;
1283 do {
1284 *((int32_t*) packed_w) = boff;
1285 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1286 } while (--n != 0);
1287 }
1288 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1289 for (size_t x = 0; x < w; x++) {
1290 for (size_t y = 0; y < h; y++) {
1291 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1292 const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001293 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001294 *((uint8_t*) packed_w) = kv;
1295 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1296 }
1297 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1298 }
1299 }
1300 }
1301}
1302
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001303void xnn_pack_qs8_dwconv_hwg_w(
1304 size_t h,
1305 size_t w,
1306 size_t c,
1307 size_t cr,
1308 const int8_t* k,
1309 const int32_t* b,
1310 void* packed_w,
1311 const struct xnn_qs8_packing_params* params)
1312{
1313 const int32_t izp = (int32_t) params->input_zero_point;
1314 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1315 const size_t cr_block_size = min(c - cr_block_start, cr);
1316 int32_t* packed_b = (int32_t*) packed_w;
1317 if XNN_LIKELY(b != NULL) {
1318 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1319 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1320 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1321 }
1322 } else {
1323 size_t n = cr_block_size;
1324 do {
1325 *((int32_t*) packed_w) = 0;
1326 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1327 } while (--n != 0);
1328 }
1329 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1330 for (size_t x = 0; x < w; x++) {
1331 for (size_t y = 0; y < h; y++) {
1332 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1333 const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1334 packed_b[cr_block_offset] -= (int32_t) kv * izp;
1335 *((int8_t*) packed_w) = kv;
1336 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1337 }
1338 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1339 }
1340 }
1341 }
1342}
1343
// Packs f32 GEMM weights stored as [g][nc][kc] without any bias slots.
//
// Each tile of up to nr output channels holds the input channels in blocks of
// kr, with nr * kr slots per block. For the first round_down_po2(kc, sr * kr)
// input channels, the kr-block read for the n-th channel of a tile is picked
// inside its sr * kr group by (block start + n * kr) & ((sr - 1) * kr); the
// remaining channels are copied in order. Padding slots are skipped without
// being written.
// The group loop runs at least once, so g is expected to be non-zero.
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t shuffle_span = sr * kr;
  const size_t shuffled_kc = round_down_po2(kc, shuffle_span);
  const size_t shuffle_mask = (sr - 1) * kr;
  size_t groups_left = g;
  do {
    for (size_t tile_start = 0; tile_start < nc; tile_start += nr) {
      const size_t tile_size = min(nc - tile_start, nr);
      // Slots left untouched after the channels present in a kr-block.
      const size_t tile_padding = (nr - tile_size) * kr;

      // Part of kc covered by whole sr * kr groups: shuffled read.
      for (size_t block_start = 0; block_start < shuffled_kc; block_start += kr) {
        const size_t group_base = round_down_po2(block_start, shuffle_span);
        for (size_t n = 0; n < tile_size; n++) {
          const size_t src = (tile_start + n) * kc + group_base + ((block_start + n * kr) & shuffle_mask);
          for (size_t j = 0; j < kr; j++) {
            *packed_w++ = k[src + j];
          }
        }
        packed_w += tile_padding;
      }

      // Remaining input channels: straight copy, last block may be short.
      for (size_t block_start = shuffled_kc; block_start < kc; block_start += kr) {
        const size_t block_size = min(kc - block_start, kr);
        for (size_t n = 0; n < tile_size; n++) {
          const size_t src = (tile_start + n) * kc + block_start;
          for (size_t j = 0; j < block_size; j++) {
            packed_w[j] = k[src + j];
          }
          packed_w += kr;
        }
        packed_w += tile_padding;
      }
    }
    k += nc * kc;
    groups_left -= 1;
  } while (groups_left != 0);
}
1387
// Packs half-precision (stored as uint16_t) GEMM weights laid out as
// [g][nc][kc] without any bias slots.
//
// Each tile of up to nr output channels holds the input channels in blocks of
// kr, with nr * kr slots per block. For the first round_down_po2(kc, sr * kr)
// input channels, the kr-block read for the n-th channel of a tile is picked
// inside its sr * kr group by (block start + n * kr) & ((sr - 1) * kr); the
// remaining channels are copied in order. Padding slots are skipped without
// being written.
// The group loop runs at least once, so g is expected to be non-zero.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t shuffle_span = sr * kr;
  const size_t shuffled_kc = round_down_po2(kc, shuffle_span);
  const size_t shuffle_mask = (sr - 1) * kr;
  size_t groups_left = g;
  do {
    for (size_t tile_start = 0; tile_start < nc; tile_start += nr) {
      const size_t tile_size = min(nc - tile_start, nr);
      // Slots left untouched after the channels present in a kr-block.
      const size_t tile_padding = (nr - tile_size) * kr;

      // Part of kc covered by whole sr * kr groups: shuffled read.
      for (size_t block_start = 0; block_start < shuffled_kc; block_start += kr) {
        const size_t group_base = round_down_po2(block_start, shuffle_span);
        for (size_t n = 0; n < tile_size; n++) {
          const size_t src = (tile_start + n) * kc + group_base + ((block_start + n * kr) & shuffle_mask);
          for (size_t j = 0; j < kr; j++) {
            *packed_w++ = k[src + j];
          }
        }
        packed_w += tile_padding;
      }

      // Remaining input channels: straight copy, last block may be short.
      for (size_t block_start = shuffled_kc; block_start < kc; block_start += kr) {
        const size_t block_size = min(kc - block_start, kr);
        for (size_t n = 0; n < tile_size; n++) {
          const size_t src = (tile_start + n) * kc + block_start;
          for (size_t j = 0; j < block_size; j++) {
            packed_w[j] = k[src + j];
          }
          packed_w += kr;
        }
        packed_w += tile_padding;
      }
    }
    k += nc * kc;
    groups_left -= 1;
  } while (groups_left != 0);
}
1431
Marat Dukhana6879bd2020-07-06 14:25:08 -07001432void xnn_pack_f32_dconv_oki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001433 size_t nc,
Marat Dukhanab582382020-07-06 13:32:08 -07001434 size_t kc,
1435 size_t nr,
Marat Dukhana6879bd2020-07-06 14:25:08 -07001436 size_t kh,
1437 size_t kw,
1438 const float* k,
1439 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001440 float* packed_w,
1441 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001442{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001443 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1444 const size_t nr_block_size = min(nc - nr_block_start, nr);
1445 if XNN_LIKELY(b != NULL) {
1446 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1447 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
Marat Dukhanab582382020-07-06 13:32:08 -07001448 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001449 } else {
1450 size_t n = nr;
1451 do {
1452 *packed_w++ = 0.0f;
1453 } while (--n != 0);
1454 }
Marat Dukhanab582382020-07-06 13:32:08 -07001455
Marat Dukhana6879bd2020-07-06 14:25:08 -07001456 for (size_t kx = 0; kx < kw; kx++) {
1457 for (size_t c = 0; c < kc; c++) {
1458 for (size_t ky = 0; ky < kh; ky++) {
1459 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1460 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
Marat Dukhanab582382020-07-06 13:32:08 -07001461 }
Marat Dukhanab582382020-07-06 13:32:08 -07001462 }
1463 }
1464 }
Marat Dukhanab582382020-07-06 13:32:08 -07001465 if XNN_UNPREDICTABLE(b != NULL) {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001466 b += nr;
Marat Dukhanab582382020-07-06 13:32:08 -07001467 }
1468 }
1469}
1470
1471void xnn_pack_f16_dconv_oki_w(
1472 size_t nc,
1473 size_t kc,
1474 size_t nr,
1475 size_t kh,
1476 size_t kw,
1477 const uint16_t* k,
1478 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001479 uint16_t* packed_w,
1480 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001481{
1482 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1483 const size_t nr_block_size = min(nc - nr_block_start, nr);
1484 if XNN_LIKELY(b != NULL) {
1485 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1486 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
1487 }
1488 } else {
1489 size_t n = nr;
1490 do {
1491 *packed_w++ = 0;
1492 } while (--n != 0);
1493 }
1494
1495 for (size_t kx = 0; kx < kw; kx++) {
1496 for (size_t c = 0; c < kc; c++) {
1497 for (size_t ky = 0; ky < kh; ky++) {
1498 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1499 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
1500 }
1501 }
1502 }
1503 }
1504 if XNN_UNPREDICTABLE(b != NULL) {
1505 b += nr;
1506 }
1507 }
1508}
1509
Marat Dukhana6879bd2020-07-06 14:25:08 -07001510void xnn_pack_f32_chw_dwconv_ghw_w(
1511 size_t kernel_size,
1512 size_t groups,
1513 const float* kernel,
1514 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001515 float* packed_weights,
1516 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001517{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001518 for (size_t g = 0; g < groups; g++) {
1519 if XNN_LIKELY(bias != NULL) {
1520 *packed_weights = *bias++;
Marat Dukhanab582382020-07-06 13:32:08 -07001521 } else {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001522 *packed_weights = 0.0f;
Marat Dukhanab582382020-07-06 13:32:08 -07001523 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001524 packed_weights += 1;
1525 for (size_t i = 0; i < kernel_size; i++) {
1526 *packed_weights++ = kernel[g * kernel_size + i];
Marat Dukhanab582382020-07-06 13:32:08 -07001527 }
1528 }
1529}
1530
1531void xnn_pack_f16_chw_dwconv_ghw_w(
1532 size_t kernel_size,
1533 size_t groups,
1534 const uint16_t* kernel,
1535 const uint16_t* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001536 uint16_t* packed_weights,
1537 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001538{
1539 for (size_t g = 0; g < groups; g++) {
1540 if XNN_LIKELY(bias != NULL) {
1541 *packed_weights = *bias++;
1542 } else {
1543 *packed_weights = 0;
1544 }
1545 packed_weights += 1;
1546 for (size_t i = 0; i < kernel_size; i++) {
1547 *packed_weights++ = kernel[g * kernel_size + i];
1548 }
1549 }
1550}
1551
Marat Dukhanab582382020-07-06 13:32:08 -07001552void xnn_pack_f32_chw_dwconv_hwg_w(
1553 size_t kernel_size,
1554 size_t groups,
1555 const float* kernel,
1556 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001557 float* packed_weights,
1558 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001559{
1560 for (size_t g = 0; g < groups; g++) {
1561 if XNN_LIKELY(bias != NULL) {
1562 *packed_weights = *bias++;
1563 } else {
1564 *packed_weights = 0.0f;
1565 }
1566 packed_weights += 1;
1567 for (size_t i = 0; i < kernel_size; i++) {
1568 *packed_weights++ = kernel[i * groups + g];
1569 }
1570 }
1571}
1572
1573void xnn_pack_f32_vmulcaddc_w(
1574 size_t c,
1575 size_t cr,
1576 const float* s,
1577 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001578 float* packed_w,
1579 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001580{
1581 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1582 const size_t cr_block_size = min(c - cr_block_start, cr);
1583 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1584 *packed_w++ = s[cr_block_start + cr_block_offset];
1585 }
1586 packed_w += cr - cr_block_size;
1587 if XNN_LIKELY(b != NULL) {
1588 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1589 *packed_w++ = b[cr_block_start + cr_block_offset];
1590 }
1591 } else {
1592 size_t n = cr_block_size;
1593 do {
1594 *packed_w++ = 0.0f;
1595 } while (--n != 0);
1596 }
1597 packed_w += cr - cr_block_size;
1598 }
1599}
1600
1601void xnn_pack_f16_vmulcaddc_w(
1602 size_t c,
1603 size_t cr,
1604 const uint16_t* s,
1605 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001606 uint16_t* packed_w,
1607 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001608{
1609 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1610 const size_t cr_block_size = min(c - cr_block_start, cr);
1611 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1612 *packed_w++ = s[cr_block_start + cr_block_offset];
1613 }
1614 packed_w += cr - cr_block_size;
1615 if XNN_LIKELY(b != NULL) {
1616 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1617 *packed_w++ = b[cr_block_start + cr_block_offset];
1618 }
1619 } else {
1620 size_t n = cr_block_size;
1621 do {
1622 *packed_w++ = 0;
1623 } while (--n != 0);
1624 }
1625 packed_w += cr - cr_block_size;
1626 }
1627}