// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <xnnpack/math.h>
#include <xnnpack/pack.h>


Marat Dukhana6879bd2020-07-06 14:25:08 -070016void xnn_pack_f32_gemm_goi_w(
17 size_t g,
18 size_t nc,
19 size_t kc,
20 size_t nr,
21 size_t kr,
22 size_t sr,
23 const float* k,
24 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070025 float* packed_w,
26 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070027{
28 const size_t skr = sr * kr;
29 const size_t skc = round_down_po2(kc, skr);
30 const size_t sr_mask = (sr - 1) * kr;
31 do {
32 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
33 const size_t nr_block_size = min(nc - nr_block_start, nr);
34 if XNN_LIKELY(b != NULL) {
35 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
36 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
37 }
38 }
39 packed_w += nr;
40
41 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
42 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
43 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
44 *packed_w++ =
45 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
46 }
47 }
48 packed_w += (nr - nr_block_size) * kr;
49 }
50
51 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
52 const size_t kr_block_size = min(kc - kr_block_start, kr);
53 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
54 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
55 *packed_w++ =
56 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
57 }
58 packed_w += kr - kr_block_size;
59 }
60 packed_w += (nr - nr_block_size) * kr;
61 }
62 }
63 k += nc * kc;
64 if XNN_UNPREDICTABLE(b != NULL) {
65 b += nc;
66 }
67 } while (--g != 0);
68}
69
70void xnn_pack_f16_gemm_goi_w(
71 size_t g,
72 size_t nc,
73 size_t kc,
74 size_t nr,
75 size_t kr,
76 size_t sr,
77 const uint16_t* k,
78 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070079 uint16_t* packed_w,
80 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070081{
82 const size_t skr = sr * kr;
83 const size_t skc = round_down_po2(kc, skr);
84 const size_t sr_mask = (sr - 1) * kr;
85 do {
86 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
87 const size_t nr_block_size = min(nc - nr_block_start, nr);
88 if XNN_LIKELY(b != NULL) {
89 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
90 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
91 }
92 }
93 packed_w += nr;
94
95 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
96 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
97 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
98 *packed_w++ =
99 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
100 }
101 }
102 packed_w += (nr - nr_block_size) * kr;
103 }
104
105 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
106 const size_t kr_block_size = min(kc - kr_block_start, kr);
107 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
108 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
109 *packed_w++ =
110 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
111 }
112 packed_w += kr - kr_block_size;
113 }
114 packed_w += (nr - nr_block_size) * kr;
115 }
116 }
117 k += nc * kc;
118 if XNN_UNPREDICTABLE(b != NULL) {
119 b += nc;
120 }
121 } while (--g != 0);
122}
123
Marat Dukhan08b7a972020-07-14 18:17:29 -0700124void xnn_pack_qu8_gemm_goi_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700125 size_t g,
126 size_t nc,
127 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700128 size_t nr,
129 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700130 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700131 const uint8_t* k,
132 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700133 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700134 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700135{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700136 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700137 const int32_t izp = (int32_t) params->input_zero_point;
138 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700139 do {
140 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
141 const size_t nr_block_size = min(nc - nr_block_start, nr);
142 int32_t* packed_b = (int32_t*) packed_w;
143 if XNN_LIKELY(b != NULL) {
144 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
145 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
146 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
147 }
148 } else {
149 size_t n = nr_block_size;
150 do {
151 *((int32_t*) packed_w) = boff;
152 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
153 } while (--n != 0);
154 }
155 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
156 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
157 const size_t kr_block_size = min(kc - kr_block_start, kr);
158 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
159 int32_t ksum = 0;
160 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
161 const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
162 ksum += (int32_t) kv;
163 *((uint8_t*) packed_w) = kv;
164 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
165 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700166 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700167 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
168 }
169 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
170 }
171 }
172 k += nc * kc;
173 if XNN_UNPREDICTABLE(b != NULL) {
174 b += nc;
175 }
176 } while (--g != 0);
177}
178
Marat Dukhan595e1702020-07-31 10:12:52 -0700179void xnn_pack_qs8_gemm_goi_w(
180 size_t g,
181 size_t nc,
182 size_t kc,
183 size_t nr,
184 size_t kr,
185 size_t sr,
186 const int8_t* k,
187 const int32_t* b,
188 void* packed_w,
189 const struct xnn_qs8_packing_params* params)
190{
191 assert(sr == 1);
192 const int32_t izp = (int32_t) params->input_zero_point;
193 do {
194 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
195 const size_t nr_block_size = min(nc - nr_block_start, nr);
196 int32_t* packed_b = (int32_t*) packed_w;
197 if XNN_LIKELY(b != NULL) {
198 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
199 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
200 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
201 }
202 } else {
203 size_t n = nr_block_size;
204 do {
205 *((int32_t*) packed_w) = 0;
206 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
207 } while (--n != 0);
208 }
209 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
210 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
211 const size_t kr_block_size = min(kc - kr_block_start, kr);
212 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
213 int32_t ksum = 0;
214 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
215 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
216 ksum += (int32_t) kv;
217 *((int8_t*) packed_w) = kv;
218 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
219 }
220 packed_b[nr_block_offset] -= ksum * izp;
221 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
222 }
223 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
224 }
225 }
226 k += nc * kc;
227 if XNN_UNPREDICTABLE(b != NULL) {
228 b += nc;
229 }
230 } while (--g != 0);
231}
232
Marat Dukhan683fab32020-08-03 19:42:52 -0700233void xnn_pack_qs8_gemm_xw_goi_w(
234 size_t g,
235 size_t nc,
236 size_t kc,
237 size_t nr,
238 size_t kr,
239 size_t sr,
240 const int8_t* k,
241 const int32_t* b,
242 void* packed_w,
243 const struct xnn_qs8_packing_params* params)
244{
245 assert(sr == 1);
246 const int32_t izp = (int32_t) params->input_zero_point;
247 do {
248 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
249 const size_t nr_block_size = min(nc - nr_block_start, nr);
250 int32_t* packed_b = (int32_t*) packed_w;
251 if XNN_LIKELY(b != NULL) {
252 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
253 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
254 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
255 }
256 } else {
257 size_t n = nr_block_size;
258 do {
259 *((int32_t*) packed_w) = 0;
260 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
261 } while (--n != 0);
262 }
263 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
264 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
265 const size_t kr_block_size = min(kc - kr_block_start, kr);
266 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
267 int32_t ksum = 0;
268 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
269 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
270 ksum += (int32_t) kv;
271 *((int16_t*) packed_w) = (int16_t) kv;
272 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int16_t));
273 }
274 packed_b[nr_block_offset] -= ksum * izp;
275 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int16_t));
276 }
277 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int16_t));
278 }
279 }
280 k += nc * kc;
281 if XNN_UNPREDICTABLE(b != NULL) {
282 b += nc;
283 }
284 } while (--g != 0);
285}
286
Marat Dukhana6879bd2020-07-06 14:25:08 -0700287void xnn_pack_f32_gemm_io_w(
288 size_t nc,
289 size_t kc,
290 size_t nr,
291 size_t kr,
292 size_t sr,
293 const float* k,
294 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700295 float* packed_w,
296 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700297{
298 const size_t skr = sr * kr;
299 const size_t skc = round_down_po2(kc, skr);
300 const size_t sr_mask = (sr - 1) * kr;
301 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
302 const size_t nr_block_size = min(nc - nr_block_start, nr);
303 if XNN_LIKELY(b != NULL) {
304 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
305 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
306 }
307 }
308 packed_w += nr;
309
310 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
311 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
312 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
313 *packed_w++ =
314 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
315 }
316 }
317 packed_w += (nr - nr_block_size) * kr;
318 }
319
320 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
321 const size_t kr_block_size = min(kc - kr_block_start, kr);
322 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
323 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
324 *packed_w++ =
325 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
326 }
327 packed_w += kr - kr_block_size;
328 }
329 packed_w += (nr - nr_block_size) * kr;
330 }
331 }
332}
333
334void xnn_pack_f16_gemm_io_w(
335 size_t nc,
336 size_t kc,
337 size_t nr,
338 size_t kr,
339 size_t sr,
340 const uint16_t* k,
341 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700342 uint16_t* packed_w,
343 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700344{
345 const size_t skr = sr * kr;
346 const size_t skc = round_down_po2(kc, skr);
347 const size_t sr_mask = (sr - 1) * kr;
348 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
349 const size_t nr_block_size = min(nc - nr_block_start, nr);
350 if XNN_LIKELY(b != NULL) {
351 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
352 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
353 }
354 }
355 packed_w += nr;
356
357 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
358 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
359 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
360 *packed_w++ =
361 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
362 }
363 }
364 packed_w += (nr - nr_block_size) * kr;
365 }
366
367 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
368 const size_t kr_block_size = min(kc - kr_block_start, kr);
369 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
370 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
371 *packed_w++ =
372 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
373 }
374 packed_w += kr - kr_block_size;
375 }
376 packed_w += (nr - nr_block_size) * kr;
377 }
378 }
379}
380
Marat Dukhan08b7a972020-07-14 18:17:29 -0700381void xnn_pack_qu8_gemm_io_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700382 size_t nc,
383 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700384 size_t nr,
385 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700386 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700387 const uint8_t* k,
388 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700389 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700390 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700391{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700392 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700393 const int32_t izp = (int32_t) params->input_zero_point;
394 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700395 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
396 const size_t nr_block_size = min(nc - nr_block_start, nr);
397 int32_t* packed_b = (int32_t*) packed_w;
398 if XNN_LIKELY(b != NULL) {
399 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
400 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
401 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
402 }
403 } else {
404 size_t n = nr_block_size;
405 do {
406 *((int32_t*) packed_w) = boff;
407 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
408 } while (--n != 0);
409 }
410 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
411 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
412 const size_t kr_block_size = min(kc - kr_block_start, kr);
413 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
414 int32_t ksum = 0;
415 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
416 const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
417 ksum += (int32_t) kv;
418 *((uint8_t*) packed_w) = kv;
419 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
420 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700421 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700422 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
423 }
424 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
425 }
426 }
427}
428
Marat Dukhand23cb6e2021-04-01 01:18:58 -0700429void xnn_pack_qs8_gemm_io_w(
430 size_t nc,
431 size_t kc,
432 size_t nr,
433 size_t kr,
434 size_t sr,
435 const int8_t* k,
436 const int32_t* b,
437 void* packed_w,
438 const struct xnn_qs8_packing_params* params)
439{
440 assert(sr == 1);
441 const int32_t izp = (int32_t) params->input_zero_point;
442 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
443 const size_t nr_block_size = min(nc - nr_block_start, nr);
444 int32_t* packed_b = (int32_t*) packed_w;
445 if XNN_LIKELY(b != NULL) {
446 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
447 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
448 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
449 }
450 } else {
451 size_t n = nr_block_size;
452 do {
453 *((int32_t*) packed_w) = 0;
454 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
455 } while (--n != 0);
456 }
457 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
458 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
459 const size_t kr_block_size = min(kc - kr_block_start, kr);
460 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
461 int32_t ksum = 0;
462 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
463 const int8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
464 ksum += (int32_t) kv;
465 *((int8_t*) packed_w) = kv;
466 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
467 }
468 packed_b[nr_block_offset] -= ksum * izp;
469 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
470 }
471 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
472 }
473 }
474}
475
Marat Dukhana6879bd2020-07-06 14:25:08 -0700476void xnn_pack_f32_conv_goki_w(
477 size_t g,
478 size_t nc,
479 size_t ks,
480 size_t kc,
481 size_t nr,
482 size_t kr,
483 size_t sr,
484 const float* k,
485 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700486 float* packed_w,
487 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700488{
489 const size_t skr = sr * kr;
490 const size_t skc = round_down_po2(kc, skr);
491 const size_t sr_mask = (sr - 1) * kr;
492 do {
493 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
494 const size_t nr_block_size = min(nc - nr_block_start, nr);
495 if XNN_LIKELY(b != NULL) {
496 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
497 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
498 }
499 }
500 packed_w += nr;
501
502 for (size_t ki = 0; ki < ks; ki++) {
503 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
504 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
505 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
506 *packed_w++ =
507 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
508 }
509 }
510 packed_w += (nr - nr_block_size) * kr;
511 }
512
513 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
514 const size_t kr_block_size = min(kc - kr_block_start, kr);
515 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
516 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
517 *packed_w++ =
518 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
519 }
520 packed_w += kr - kr_block_size;
521 }
522 packed_w += (nr - nr_block_size) * kr;
523 }
524 }
525 }
526 k += ks * kc * nc;
527 if XNN_UNPREDICTABLE(b != NULL) {
528 b += nc;
529 }
530 } while (--g != 0);
531}
532
533void xnn_pack_f16_conv_goki_w(
534 size_t g,
535 size_t nc,
536 size_t ks,
537 size_t kc,
538 size_t nr,
539 size_t kr,
540 size_t sr,
541 const uint16_t* k,
542 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700543 uint16_t* packed_w,
544 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700545{
546 const size_t skr = sr * kr;
547 const size_t skc = round_down_po2(kc, skr);
548 const size_t sr_mask = (sr - 1) * kr;
549 do {
550 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
551 const size_t nr_block_size = min(nc - nr_block_start, nr);
552 if XNN_LIKELY(b != NULL) {
553 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
554 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
555 }
556 }
557 packed_w += nr;
558
559 for (size_t ki = 0; ki < ks; ki++) {
560 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
561 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
562 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
563 *packed_w++ =
564 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
565 }
566 }
567 packed_w += (nr - nr_block_size) * kr;
568 }
569
570 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
571 const size_t kr_block_size = min(kc - kr_block_start, kr);
572 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
573 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
574 *packed_w++ =
575 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
576 }
577 packed_w += kr - kr_block_size;
578 }
579 packed_w += (nr - nr_block_size) * kr;
580 }
581 }
582 }
583 k += ks * kc * nc;
584 if XNN_UNPREDICTABLE(b != NULL) {
585 b += nc;
586 }
587 } while (--g != 0);
588}
589
Marat Dukhan08b7a972020-07-14 18:17:29 -0700590void xnn_pack_qu8_conv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700591 size_t g,
592 size_t nc,
593 size_t ks,
594 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700595 size_t nr,
596 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700597 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700598 const uint8_t* k,
599 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700600 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700601 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700602{
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700603 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700604 const int32_t izp = (int32_t) params->input_zero_point;
605 const int32_t boff = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700606 do {
607 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
608 const size_t nr_block_size = min(nc - nr_block_start, nr);
609 int32_t* packed_b = (int32_t*) packed_w;
610 if XNN_LIKELY(b != NULL) {
611 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
612 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
613 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
614 }
615 } else {
616 size_t n = nr_block_size;
617 do {
618 *((int32_t*) packed_w) = boff;
619 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
620 } while (--n != 0);
621 }
622 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
623 for (size_t ki = 0; ki < ks; ki++) {
624 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
625 const size_t kr_block_size = min(kc - kr_block_start, kr);
626 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
627 int32_t ksum = 0;
628 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
629 const uint8_t kv =
630 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
631 ksum += (int32_t) kv;
632 *((uint8_t*) packed_w) = kv;
633 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
634 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700635 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700636 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
637 }
638 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
639 }
640 }
641 }
642 k += ks * kc * nc;
643 if XNN_UNPREDICTABLE(b != NULL) {
644 b += nc;
645 }
646 } while (--g != 0);
647}
648
Marat Dukhanf9480682020-07-31 14:50:24 -0700649void xnn_pack_qs8_conv_goki_w(
650 size_t g,
651 size_t nc,
652 size_t ks,
653 size_t kc,
654 size_t nr,
655 size_t kr,
656 size_t sr,
657 const int8_t* k,
658 const int32_t* b,
659 void* packed_w,
660 const struct xnn_qs8_packing_params* params)
661{
662 assert(sr == 1);
663 const int32_t izp = (int32_t) params->input_zero_point;
664 do {
665 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
666 const size_t nr_block_size = min(nc - nr_block_start, nr);
667 int32_t* packed_b = (int32_t*) packed_w;
668 if XNN_LIKELY(b != NULL) {
669 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
670 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
671 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
672 }
673 } else {
674 size_t n = nr_block_size;
675 do {
676 *((int32_t*) packed_w) = 0;
677 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
678 } while (--n != 0);
679 }
680 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
681 for (size_t ki = 0; ki < ks; ki++) {
682 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
683 const size_t kr_block_size = min(kc - kr_block_start, kr);
684 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
685 int32_t ksum = 0;
686 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
687 const int8_t kv =
688 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
689 ksum += (int32_t) kv;
690 *((int8_t*) packed_w) = kv;
691 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
692 }
693 packed_b[nr_block_offset] -= ksum * izp;
694 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
695 }
696 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
697 }
698 }
699 }
700 k += ks * kc * nc;
701 if XNN_UNPREDICTABLE(b != NULL) {
702 b += nc;
703 }
704 } while (--g != 0);
705}
706
Marat Dukhana6879bd2020-07-06 14:25:08 -0700707void xnn_pack_f32_conv_kgo_w(
708 size_t g,
709 size_t nc,
710 size_t ks,
711 size_t nr,
712 size_t kr,
713 const float* k,
714 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700715 float* packed_w,
716 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700717{
718 for (size_t i = 0; i < g; i++) {
719 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
720 const size_t nr_block_size = min(nc - nr_block_start, nr);
721 if XNN_LIKELY(b != NULL) {
722 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
723 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
724 }
725 }
726 packed_w += nr;
727 for (size_t ki = 0; ki < ks; ki++) {
728 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
729 *packed_w =
730 k[ki * g * nc + (nr_block_start + nr_block_offset)];
731 packed_w += kr;
732 }
733 packed_w += (nr - nr_block_size) * kr;
734 }
735 }
736 k += nc;
737 if XNN_UNPREDICTABLE(b != NULL) {
738 b += nc;
739 }
740 }
741}
742
743void xnn_pack_f16_conv_kgo_w(
744 size_t g,
745 size_t nc,
746 size_t ks,
747 size_t nr,
748 size_t kr,
749 const uint16_t* k,
750 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700751 uint16_t* packed_w,
752 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700753{
754 for (size_t i = 0; i < g; i++) {
755 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
756 const size_t nr_block_size = min(nc - nr_block_start, nr);
757 if XNN_LIKELY(b != NULL) {
758 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
759 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
760 }
761 }
762 packed_w += nr;
763 for (size_t ki = 0; ki < ks; ki++) {
764 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
765 *packed_w =
766 k[ki * g * nc + (nr_block_start + nr_block_offset)];
767 packed_w += kr;
768 }
769 packed_w += (nr - nr_block_size) * kr;
770 }
771 }
772 k += nc;
773 if XNN_UNPREDICTABLE(b != NULL) {
774 b += nc;
775 }
776 }
777}
778
Marat Dukhan08b7a972020-07-14 18:17:29 -0700779void xnn_pack_qu8_conv_kgo_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700780 size_t g,
781 size_t nc,
782 size_t ks,
Marat Dukhana6879bd2020-07-06 14:25:08 -0700783 size_t nr,
784 size_t kr,
Marat Dukhanab582382020-07-06 13:32:08 -0700785 const uint8_t* k,
786 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700787 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700788 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700789{
Marat Dukhanb42f8662020-07-06 20:46:13 -0700790 const int32_t izp = (int32_t) params->input_zero_point;
791 const int32_t boff = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700792 for (size_t i = 0; i < g; i++) {
793 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
794 const size_t nr_block_size = min(nc - nr_block_start, nr);
795 int32_t* packed_b = (int32_t*) packed_w;
796 if XNN_LIKELY(b != NULL) {
797 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
798 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
799 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
800 }
801 } else {
802 size_t n = nr_block_size;
803 do {
804 *((int32_t*) packed_w) = boff;
805 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
806 } while (--n != 0);
807 }
808 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
809 for (size_t ki = 0; ki < ks; ki++) {
810 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
811 const uint8_t kv =
812 k[ki * g * nc + (nr_block_start + nr_block_offset)];
813 *((uint8_t*) packed_w) = kv;
Marat Dukhanb42f8662020-07-06 20:46:13 -0700814 packed_b[nr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700815 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
816 }
817 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
818 }
819 }
820 k += nc;
821 if XNN_UNPREDICTABLE(b != NULL) {
822 b += nc;
823 }
824 }
825}
826
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700827void xnn_pack_qs8_conv_kgo_w(
828 size_t g,
829 size_t nc,
830 size_t ks,
831 size_t nr,
832 size_t kr,
833 const int8_t* k,
834 const int32_t* b,
835 void* packed_w,
836 const struct xnn_qs8_packing_params* params)
837{
838 const int32_t izp = (int32_t) params->input_zero_point;
839 for (size_t i = 0; i < g; i++) {
840 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
841 const size_t nr_block_size = min(nc - nr_block_start, nr);
842 int32_t* packed_b = (int32_t*) packed_w;
843 if XNN_LIKELY(b != NULL) {
844 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
845 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
846 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
847 }
848 } else {
849 size_t n = nr_block_size;
850 do {
851 *((int32_t*) packed_w) = 0;
852 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
853 } while (--n != 0);
854 }
855 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
856 for (size_t ki = 0; ki < ks; ki++) {
857 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
858 const int8_t kv =
859 k[ki * g * nc + (nr_block_start + nr_block_offset)];
860 *((int8_t*) packed_w) = kv;
861 packed_b[nr_block_offset] -= (int32_t) kv * izp;
862 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(int8_t));
863 }
864 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
865 }
866 }
867 k += nc;
868 if XNN_UNPREDICTABLE(b != NULL) {
869 b += nc;
870 }
871 }
872}
873
Marat Dukhana6879bd2020-07-06 14:25:08 -0700874void xnn_pack_f32_deconv_goki_w(
875 size_t g,
876 size_t nc,
877 size_t kh,
878 size_t kw,
879 size_t kc,
880 size_t sh,
881 size_t sw,
882 size_t nr,
883 size_t kr,
884 size_t sr,
885 const float* k,
886 const float* b,
887 float* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700888 struct subconvolution_params* subconv_params,
889 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700890{
891 const size_t skr = sr * kr;
892 const size_t skc = round_down_po2(kc, skr);
893 const size_t sr_mask = (sr - 1) * kr;
894 for (size_t i = 0; i < g; i++) {
895 for (size_t oy = 0; oy < sh; oy++) {
896 for (size_t ox = 0; ox < sw; ox++) {
897 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700898 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700899 }
900 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
901 const size_t nr_block_size = min(nc - nr_block_start, nr);
902 if XNN_LIKELY(b != NULL) {
903 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
904 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
905 }
906 }
907 packed_w += nr;
908 for (size_t ky = oy; ky < kh; ky += sh) {
909 for (size_t kx = ox; kx < kw; kx += sw) {
910 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
911 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
912 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
913 *packed_w++ =
914 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
915 }
916 }
917 packed_w += (nr - nr_block_size) * kr;
918 }
919
920 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
921 const size_t kr_block_size = min(kc - kr_block_start, kr);
922 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
923 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
924 *packed_w++ =
925 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
926 }
927 packed_w += kr - kr_block_size;
928 }
929 packed_w += (nr - nr_block_size) * kr;
930 }
931 }
932 }
933 }
934 }
935 }
936 k += kh * kw * kc * nc;
937 if XNN_UNPREDICTABLE(b != NULL) {
938 b += nc;
939 }
940 }
941}
942
943void xnn_pack_f16_deconv_goki_w(
944 size_t g,
945 size_t nc,
946 size_t kh,
947 size_t kw,
948 size_t kc,
949 size_t sh,
950 size_t sw,
951 size_t nr,
952 size_t kr,
953 size_t sr,
954 const uint16_t* k,
955 const uint16_t* b,
956 uint16_t* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700957 struct subconvolution_params* subconv_params,
958 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700959{
960 const size_t skr = sr * kr;
961 const size_t skc = round_down_po2(kc, skr);
962 const size_t sr_mask = (sr - 1) * kr;
963 for (size_t i = 0; i < g; i++) {
964 for (size_t oy = 0; oy < sh; oy++) {
965 for (size_t ox = 0; ox < sw; ox++) {
966 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700967 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700968 }
969 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
970 const size_t nr_block_size = min(nc - nr_block_start, nr);
971 if XNN_LIKELY(b != NULL) {
972 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
973 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
974 }
975 }
976 packed_w += nr;
977 for (size_t ky = oy; ky < kh; ky += sh) {
978 for (size_t kx = ox; kx < kw; kx += sw) {
979 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
980 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
981 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
982 *packed_w++ =
983 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
984 }
985 }
986 packed_w += (nr - nr_block_size) * kr;
987 }
988
989 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
990 const size_t kr_block_size = min(kc - kr_block_start, kr);
991 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
992 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
993 *packed_w++ =
994 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
995 }
996 packed_w += kr - kr_block_size;
997 }
998 packed_w += (nr - nr_block_size) * kr;
999 }
1000 }
1001 }
1002 }
1003 }
1004 }
1005 k += kh * kw * kc * nc;
1006 if XNN_UNPREDICTABLE(b != NULL) {
1007 b += nc;
1008 }
1009 }
1010}
1011
Marat Dukhan08b7a972020-07-14 18:17:29 -07001012void xnn_pack_qu8_deconv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001013 size_t g,
1014 size_t nc,
1015 size_t kh,
1016 size_t kw,
1017 size_t kc,
1018 size_t sh,
1019 size_t sw,
1020 size_t nr,
1021 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -07001022 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -07001023 const uint8_t* k,
1024 const int32_t* b,
1025 void* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001026 struct subconvolution_params* subconv_params,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001027 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001028{
Marat Dukhan5a698bb2020-07-07 20:47:55 -07001029 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -07001030 const int32_t izp = (int32_t) params->input_zero_point;
1031 const int32_t kzp = (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001032 for (size_t i = 0; i < g; i++) {
1033 for (size_t oy = 0; oy < sh; oy++) {
1034 for (size_t ox = 0; ox < sw; ox++) {
1035 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -07001036 (*subconv_params++).weights = packed_w;
Marat Dukhanab582382020-07-06 13:32:08 -07001037 }
Marat Dukhanb42f8662020-07-06 20:46:13 -07001038 const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
Marat Dukhanab582382020-07-06 13:32:08 -07001039 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1040 const size_t nr_block_size = min(nc - nr_block_start, nr);
1041 int32_t* packed_b = (int32_t*) packed_w;
1042 if XNN_LIKELY(b != 0) {
1043 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1044 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
1045 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1046 }
1047 } else {
1048 size_t n = nr_block_size;
1049 do {
1050 *((int32_t*) packed_w) = boff;
1051 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1052 } while (--n != 0);
1053 }
1054 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1055 for (size_t ky = oy; ky < kh; ky += sh) {
1056 for (size_t kx = ox; kx < kw; kx += sw) {
1057 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
1058 const size_t kr_block_size = min(kc - kr_block_start, kr);
1059 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1060 int32_t ksum = 0;
1061 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1062 const uint8_t kv =
1063 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
1064 ksum += (int32_t) kv;
1065 *((uint8_t*) packed_w) = kv;
1066 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1067 }
Marat Dukhanb42f8662020-07-06 20:46:13 -07001068 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001069 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
1070 }
1071 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
1072 }
1073 }
1074 }
1075 }
1076 }
1077 }
1078 k += kh * kw * kc * nc;
1079 if XNN_UNPREDICTABLE(b != NULL) {
1080 b += nc;
1081 }
1082 }
1083}
1084
Marat Dukhana6879bd2020-07-06 14:25:08 -07001085void xnn_pack_f32_dwconv_ghw_w(
1086 size_t h,
1087 size_t w,
1088 size_t c,
1089 size_t cr,
1090 const float* k,
1091 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001092 float* packed_w,
1093 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001094{
1095 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1096 const size_t cr_block_size = min(c - cr_block_start, cr);
1097 if XNN_LIKELY(b != NULL) {
1098 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1099 *packed_w++ = b[cr_block_start + cr_block_offset];
1100 }
1101 } else {
1102 size_t n = cr_block_size;
1103 do {
1104 *packed_w++ = 0.0f;
1105 } while (--n != 0);
1106 }
1107 packed_w += cr - cr_block_size;
1108 for (size_t x = 0; x < w; x++) {
1109 for (size_t y = 0; y < h; y++) {
1110 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1111 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1112 *packed_w++ = kv;
1113 }
1114 packed_w += cr - cr_block_size;
1115 }
1116 }
1117 }
1118}
1119
1120void xnn_pack_f16_dwconv_ghw_w(
1121 size_t h,
1122 size_t w,
1123 size_t c,
1124 size_t cr,
1125 const uint16_t* k,
1126 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001127 uint16_t* packed_w,
1128 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001129{
1130 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1131 const size_t cr_block_size = min(c - cr_block_start, cr);
1132 if XNN_LIKELY(b != NULL) {
1133 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1134 *packed_w++ = b[cr_block_start + cr_block_offset];
1135 }
1136 } else {
1137 size_t n = cr_block_size;
1138 do {
1139 *packed_w++ = 0;
1140 } while (--n != 0);
1141 }
1142 packed_w += cr - cr_block_size;
1143 for (size_t x = 0; x < w; x++) {
1144 for (size_t y = 0; y < h; y++) {
1145 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1146 const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1147 *packed_w++ = kv;
1148 }
1149 packed_w += cr - cr_block_size;
1150 }
1151 }
1152 }
1153}
1154
Marat Dukhan08b7a972020-07-14 18:17:29 -07001155void xnn_pack_qu8_dwconv_ghw_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001156 size_t h,
1157 size_t w,
1158 size_t c,
1159 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001160 const uint8_t* k,
1161 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001162 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001163 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001164{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001165 const int32_t izp = (int32_t) params->input_zero_point;
1166 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001167 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1168 const size_t cr_block_size = min(c - cr_block_start, cr);
1169 int32_t* packed_b = (int32_t*) packed_w;
1170 if XNN_LIKELY(b != NULL) {
1171 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1172 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1173 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1174 }
1175 } else {
1176 size_t n = cr_block_size;
1177 do {
1178 *((int32_t*) packed_w) = boff;
1179 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1180 } while (--n != 0);
1181 }
1182 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1183 for (size_t x = 0; x < w; x++) {
1184 for (size_t y = 0; y < h; y++) {
1185 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1186 const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001187 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001188 *((uint8_t*) packed_w) = kv;
1189 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1190 }
1191 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1192 }
1193 }
1194 }
1195}
1196
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001197void xnn_pack_qs8_dwconv_ghw_w(
1198 size_t h,
1199 size_t w,
1200 size_t c,
1201 size_t cr,
1202 const int8_t* k,
1203 const int32_t* b,
1204 void* packed_w,
1205 const struct xnn_qs8_packing_params* params)
1206{
1207 const int32_t izp = (int32_t) params->input_zero_point;
1208 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1209 const size_t cr_block_size = min(c - cr_block_start, cr);
1210 int32_t* packed_b = (int32_t*) packed_w;
1211 if XNN_LIKELY(b != NULL) {
1212 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1213 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1214 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1215 }
1216 } else {
1217 size_t n = cr_block_size;
1218 do {
1219 *((int32_t*) packed_w) = 0;
1220 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1221 } while (--n != 0);
1222 }
1223 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1224 for (size_t x = 0; x < w; x++) {
1225 for (size_t y = 0; y < h; y++) {
1226 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1227 const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1228 packed_b[cr_block_offset] -= (int32_t) kv * izp;
1229 *((int8_t*) packed_w) = kv;
1230 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1231 }
1232 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1233 }
1234 }
1235 }
1236}
1237
Marat Dukhana6879bd2020-07-06 14:25:08 -07001238void xnn_pack_f32_dwconv_hwg_w(
1239 size_t h,
1240 size_t w,
1241 size_t c,
1242 size_t cr,
1243 const float* k,
1244 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001245 float* packed_w,
1246 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001247{
1248 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1249 const size_t cr_block_size = min(c - cr_block_start, cr);
1250 if XNN_LIKELY(b != NULL) {
1251 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1252 *packed_w++ = b[cr_block_start + cr_block_offset];
1253 }
1254 } else {
1255 size_t n = cr_block_size;
1256 do {
1257 *packed_w++ = 0.0f;
1258 } while (--n != 0);
1259 }
1260 packed_w += cr - cr_block_size;
1261 for (size_t x = 0; x < w; x++) {
1262 for (size_t y = 0; y < h; y++) {
1263 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1264 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1265 *packed_w++ = kv;
1266 }
1267 packed_w += cr - cr_block_size;
1268 }
1269 }
1270 }
1271}
1272
1273void xnn_pack_f16_dwconv_hwg_w(
1274 size_t h,
1275 size_t w,
1276 size_t c,
1277 size_t cr,
1278 const uint16_t* k,
1279 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001280 uint16_t* packed_w,
1281 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001282{
1283 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1284 const size_t cr_block_size = min(c - cr_block_start, cr);
1285 if XNN_LIKELY(b != NULL) {
1286 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1287 *packed_w++ = b[cr_block_start + cr_block_offset];
1288 }
1289 } else {
1290 size_t n = cr_block_size;
1291 do {
1292 *packed_w++ = 0;
1293 } while (--n != 0);
1294 }
1295 packed_w += cr - cr_block_size;
1296 for (size_t x = 0; x < w; x++) {
1297 for (size_t y = 0; y < h; y++) {
1298 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1299 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1300 *packed_w++ = kv;
1301 }
1302 packed_w += cr - cr_block_size;
1303 }
1304 }
1305 }
1306}
1307
Marat Dukhan08b7a972020-07-14 18:17:29 -07001308void xnn_pack_qu8_dwconv_hwg_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001309 size_t h,
1310 size_t w,
1311 size_t c,
1312 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001313 const uint8_t* k,
1314 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001315 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001316 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001317{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001318 const int32_t izp = (int32_t) params->input_zero_point;
1319 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001320 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1321 const size_t cr_block_size = min(c - cr_block_start, cr);
1322 int32_t* packed_b = (int32_t*) packed_w;
1323 if XNN_LIKELY(b != NULL) {
1324 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1325 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1326 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1327 }
1328 } else {
1329 size_t n = cr_block_size;
1330 do {
1331 *((int32_t*) packed_w) = boff;
1332 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1333 } while (--n != 0);
1334 }
1335 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1336 for (size_t x = 0; x < w; x++) {
1337 for (size_t y = 0; y < h; y++) {
1338 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1339 const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001340 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001341 *((uint8_t*) packed_w) = kv;
1342 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1343 }
1344 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1345 }
1346 }
1347 }
1348}
1349
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001350void xnn_pack_qs8_dwconv_hwg_w(
1351 size_t h,
1352 size_t w,
1353 size_t c,
1354 size_t cr,
1355 const int8_t* k,
1356 const int32_t* b,
1357 void* packed_w,
1358 const struct xnn_qs8_packing_params* params)
1359{
1360 const int32_t izp = (int32_t) params->input_zero_point;
1361 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1362 const size_t cr_block_size = min(c - cr_block_start, cr);
1363 int32_t* packed_b = (int32_t*) packed_w;
1364 if XNN_LIKELY(b != NULL) {
1365 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1366 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1367 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1368 }
1369 } else {
1370 size_t n = cr_block_size;
1371 do {
1372 *((int32_t*) packed_w) = 0;
1373 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1374 } while (--n != 0);
1375 }
1376 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1377 for (size_t x = 0; x < w; x++) {
1378 for (size_t y = 0; y < h; y++) {
1379 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1380 const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1381 packed_b[cr_block_offset] -= (int32_t) kv * izp;
1382 *((int8_t*) packed_w) = kv;
1383 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1384 }
1385 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1386 }
1387 }
1388 }
1389}
1390
// Packs F32 GEMM weights without a bias row. Weights are read as
// k[n * kc + c] per group (k is advanced by nc * kc after each group).
//
// Output channels are handled in blocks of nr; for each block the kc input
// channels are emitted in chunks of kr values per output channel. Slots that
// belong to padding output channels or to the unused tail of a partial chunk
// are skipped without being written.
//
// For chunks below round_down_po2(kc, sr * kr) the source chunk is picked per
// output channel n as
//   round_down_po2(chunk, sr * kr) + ((chunk + n * kr) & ((sr - 1) * kr)),
// the remaining chunks are copied in order.
//
// Groups are iterated with a counted loop, so g == 0 packs nothing instead of
// wrapping the group counter around.
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;

  for (size_t group = 0; group < g; group++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      // Chunks whose source position depends on the output channel.
      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      // Remaining chunks, copied in order; the last may be partial.
      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
  }
}
1434
// Packs F16 (stored as uint16_t) GEMM weights without a bias row. Weights are
// read as k[n * kc + c] per group (k is advanced by nc * kc after each group).
//
// Output channels are handled in blocks of nr; for each block the kc input
// channels are emitted in chunks of kr values per output channel. Slots that
// belong to padding output channels or to the unused tail of a partial chunk
// are skipped without being written.
//
// For chunks below round_down_po2(kc, sr * kr) the source chunk is picked per
// output channel n as
//   round_down_po2(chunk, sr * kr) + ((chunk + n * kr) & ((sr - 1) * kr)),
// the remaining chunks are copied in order.
//
// Groups are iterated with a counted loop, so g == 0 packs nothing instead of
// wrapping the group counter around.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;

  for (size_t group = 0; group < g; group++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      // Chunks whose source position depends on the output channel.
      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      // Remaining chunks, copied in order; the last may be partial.
      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
  }
}
1478
Marat Dukhana6879bd2020-07-06 14:25:08 -07001479void xnn_pack_f32_dconv_oki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001480 size_t nc,
Marat Dukhanab582382020-07-06 13:32:08 -07001481 size_t kc,
1482 size_t nr,
Marat Dukhana6879bd2020-07-06 14:25:08 -07001483 size_t kh,
1484 size_t kw,
1485 const float* k,
1486 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001487 float* packed_w,
1488 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001489{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001490 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1491 const size_t nr_block_size = min(nc - nr_block_start, nr);
1492 if XNN_LIKELY(b != NULL) {
1493 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1494 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
Marat Dukhanab582382020-07-06 13:32:08 -07001495 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001496 } else {
1497 size_t n = nr;
1498 do {
1499 *packed_w++ = 0.0f;
1500 } while (--n != 0);
1501 }
Marat Dukhanab582382020-07-06 13:32:08 -07001502
Marat Dukhana6879bd2020-07-06 14:25:08 -07001503 for (size_t kx = 0; kx < kw; kx++) {
1504 for (size_t c = 0; c < kc; c++) {
1505 for (size_t ky = 0; ky < kh; ky++) {
1506 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1507 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
Marat Dukhanab582382020-07-06 13:32:08 -07001508 }
Marat Dukhanab582382020-07-06 13:32:08 -07001509 }
1510 }
1511 }
Marat Dukhanab582382020-07-06 13:32:08 -07001512 if XNN_UNPREDICTABLE(b != NULL) {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001513 b += nr;
Marat Dukhanab582382020-07-06 13:32:08 -07001514 }
1515 }
1516}
1517
1518void xnn_pack_f16_dconv_oki_w(
1519 size_t nc,
1520 size_t kc,
1521 size_t nr,
1522 size_t kh,
1523 size_t kw,
1524 const uint16_t* k,
1525 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001526 uint16_t* packed_w,
1527 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001528{
1529 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1530 const size_t nr_block_size = min(nc - nr_block_start, nr);
1531 if XNN_LIKELY(b != NULL) {
1532 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1533 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
1534 }
1535 } else {
1536 size_t n = nr;
1537 do {
1538 *packed_w++ = 0;
1539 } while (--n != 0);
1540 }
1541
1542 for (size_t kx = 0; kx < kw; kx++) {
1543 for (size_t c = 0; c < kc; c++) {
1544 for (size_t ky = 0; ky < kh; ky++) {
1545 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1546 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
1547 }
1548 }
1549 }
1550 }
1551 if XNN_UNPREDICTABLE(b != NULL) {
1552 b += nr;
1553 }
1554 }
1555}
1556
Marat Dukhana6879bd2020-07-06 14:25:08 -07001557void xnn_pack_f32_chw_dwconv_ghw_w(
1558 size_t kernel_size,
1559 size_t groups,
1560 const float* kernel,
1561 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001562 float* packed_weights,
1563 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001564{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001565 for (size_t g = 0; g < groups; g++) {
1566 if XNN_LIKELY(bias != NULL) {
1567 *packed_weights = *bias++;
Marat Dukhanab582382020-07-06 13:32:08 -07001568 } else {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001569 *packed_weights = 0.0f;
Marat Dukhanab582382020-07-06 13:32:08 -07001570 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001571 packed_weights += 1;
1572 for (size_t i = 0; i < kernel_size; i++) {
1573 *packed_weights++ = kernel[g * kernel_size + i];
Marat Dukhanab582382020-07-06 13:32:08 -07001574 }
1575 }
1576}
1577
1578void xnn_pack_f16_chw_dwconv_ghw_w(
1579 size_t kernel_size,
1580 size_t groups,
1581 const uint16_t* kernel,
1582 const uint16_t* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001583 uint16_t* packed_weights,
1584 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001585{
1586 for (size_t g = 0; g < groups; g++) {
1587 if XNN_LIKELY(bias != NULL) {
1588 *packed_weights = *bias++;
1589 } else {
1590 *packed_weights = 0;
1591 }
1592 packed_weights += 1;
1593 for (size_t i = 0; i < kernel_size; i++) {
1594 *packed_weights++ = kernel[g * kernel_size + i];
1595 }
1596 }
1597}
1598
Marat Dukhanab582382020-07-06 13:32:08 -07001599void xnn_pack_f32_chw_dwconv_hwg_w(
1600 size_t kernel_size,
1601 size_t groups,
1602 const float* kernel,
1603 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001604 float* packed_weights,
1605 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001606{
1607 for (size_t g = 0; g < groups; g++) {
1608 if XNN_LIKELY(bias != NULL) {
1609 *packed_weights = *bias++;
1610 } else {
1611 *packed_weights = 0.0f;
1612 }
1613 packed_weights += 1;
1614 for (size_t i = 0; i < kernel_size; i++) {
1615 *packed_weights++ = kernel[i * groups + g];
1616 }
1617 }
1618}
1619
1620void xnn_pack_f32_vmulcaddc_w(
1621 size_t c,
1622 size_t cr,
1623 const float* s,
1624 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001625 float* packed_w,
1626 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001627{
1628 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1629 const size_t cr_block_size = min(c - cr_block_start, cr);
1630 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1631 *packed_w++ = s[cr_block_start + cr_block_offset];
1632 }
1633 packed_w += cr - cr_block_size;
1634 if XNN_LIKELY(b != NULL) {
1635 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1636 *packed_w++ = b[cr_block_start + cr_block_offset];
1637 }
1638 } else {
1639 size_t n = cr_block_size;
1640 do {
1641 *packed_w++ = 0.0f;
1642 } while (--n != 0);
1643 }
1644 packed_w += cr - cr_block_size;
1645 }
1646}
1647
1648void xnn_pack_f16_vmulcaddc_w(
1649 size_t c,
1650 size_t cr,
1651 const uint16_t* s,
1652 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001653 uint16_t* packed_w,
1654 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001655{
1656 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1657 const size_t cr_block_size = min(c - cr_block_start, cr);
1658 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1659 *packed_w++ = s[cr_block_start + cr_block_offset];
1660 }
1661 packed_w += cr - cr_block_size;
1662 if XNN_LIKELY(b != NULL) {
1663 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1664 *packed_w++ = b[cr_block_start + cr_block_offset];
1665 }
1666 } else {
1667 size_t n = cr_block_size;
1668 do {
1669 *packed_w++ = 0;
1670 } while (--n != 0);
1671 }
1672 packed_w += cr - cr_block_size;
1673 }
1674}