blob: 49a3c2abfe33c56c524001eaca0b00128211f38b [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8
9#include <stdint.h>
10#include <stddef.h>
11
12#include <xnnpack/math.h>
13#include <xnnpack/pack.h>
14
15
Marat Dukhana6879bd2020-07-06 14:25:08 -070016void xnn_pack_f32_gemm_goi_w(
17 size_t g,
18 size_t nc,
19 size_t kc,
20 size_t nr,
21 size_t kr,
22 size_t sr,
23 const float* k,
24 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070025 float* packed_w,
26 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070027{
28 const size_t skr = sr * kr;
29 const size_t skc = round_down_po2(kc, skr);
30 const size_t sr_mask = (sr - 1) * kr;
31 do {
32 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
33 const size_t nr_block_size = min(nc - nr_block_start, nr);
34 if XNN_LIKELY(b != NULL) {
35 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
36 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
37 }
38 }
39 packed_w += nr;
40
41 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
42 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
43 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
44 *packed_w++ =
45 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
46 }
47 }
48 packed_w += (nr - nr_block_size) * kr;
49 }
50
51 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
52 const size_t kr_block_size = min(kc - kr_block_start, kr);
53 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
54 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
55 *packed_w++ =
56 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
57 }
58 packed_w += kr - kr_block_size;
59 }
60 packed_w += (nr - nr_block_size) * kr;
61 }
62 }
63 k += nc * kc;
64 if XNN_UNPREDICTABLE(b != NULL) {
65 b += nc;
66 }
67 } while (--g != 0);
68}
69
70void xnn_pack_f16_gemm_goi_w(
71 size_t g,
72 size_t nc,
73 size_t kc,
74 size_t nr,
75 size_t kr,
76 size_t sr,
77 const uint16_t* k,
78 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070079 uint16_t* packed_w,
80 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070081{
82 const size_t skr = sr * kr;
83 const size_t skc = round_down_po2(kc, skr);
84 const size_t sr_mask = (sr - 1) * kr;
85 do {
86 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
87 const size_t nr_block_size = min(nc - nr_block_start, nr);
88 if XNN_LIKELY(b != NULL) {
89 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
90 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
91 }
92 }
93 packed_w += nr;
94
95 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
96 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
97 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
98 *packed_w++ =
99 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
100 }
101 }
102 packed_w += (nr - nr_block_size) * kr;
103 }
104
105 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
106 const size_t kr_block_size = min(kc - kr_block_start, kr);
107 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
108 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
109 *packed_w++ =
110 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
111 }
112 packed_w += kr - kr_block_size;
113 }
114 packed_w += (nr - nr_block_size) * kr;
115 }
116 }
117 k += nc * kc;
118 if XNN_UNPREDICTABLE(b != NULL) {
119 b += nc;
120 }
121 } while (--g != 0);
122}
123
Marat Dukhan08b7a972020-07-14 18:17:29 -0700124void xnn_pack_qu8_gemm_goi_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700125 size_t g,
126 size_t nc,
127 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700128 size_t nr,
129 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700130 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700131 const uint8_t* k,
132 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700133 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700134 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700135{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700136 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700137 const int32_t izp = (int32_t) params->input_zero_point;
138 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700139 do {
140 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
141 const size_t nr_block_size = min(nc - nr_block_start, nr);
142 int32_t* packed_b = (int32_t*) packed_w;
143 if XNN_LIKELY(b != NULL) {
144 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
145 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
146 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
147 }
148 } else {
149 size_t n = nr_block_size;
150 do {
151 *((int32_t*) packed_w) = boff;
152 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
153 } while (--n != 0);
154 }
155 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
156 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
157 const size_t kr_block_size = min(kc - kr_block_start, kr);
158 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
159 int32_t ksum = 0;
160 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
161 const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
162 ksum += (int32_t) kv;
163 *((uint8_t*) packed_w) = kv;
164 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
165 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700166 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700167 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
168 }
169 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
170 }
171 }
172 k += nc * kc;
173 if XNN_UNPREDICTABLE(b != NULL) {
174 b += nc;
175 }
176 } while (--g != 0);
177}
178
Marat Dukhan595e1702020-07-31 10:12:52 -0700179void xnn_pack_qs8_gemm_goi_w(
180 size_t g,
181 size_t nc,
182 size_t kc,
183 size_t nr,
184 size_t kr,
185 size_t sr,
186 const int8_t* k,
187 const int32_t* b,
188 void* packed_w,
189 const struct xnn_qs8_packing_params* params)
190{
191 assert(sr == 1);
192 const int32_t izp = (int32_t) params->input_zero_point;
193 do {
194 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
195 const size_t nr_block_size = min(nc - nr_block_start, nr);
196 int32_t* packed_b = (int32_t*) packed_w;
197 if XNN_LIKELY(b != NULL) {
198 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
199 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
200 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
201 }
202 } else {
203 size_t n = nr_block_size;
204 do {
205 *((int32_t*) packed_w) = 0;
206 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
207 } while (--n != 0);
208 }
209 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
210 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
211 const size_t kr_block_size = min(kc - kr_block_start, kr);
212 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
213 int32_t ksum = 0;
214 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
215 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
216 ksum += (int32_t) kv;
217 *((int8_t*) packed_w) = kv;
218 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
219 }
220 packed_b[nr_block_offset] -= ksum * izp;
221 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
222 }
223 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
224 }
225 }
226 k += nc * kc;
227 if XNN_UNPREDICTABLE(b != NULL) {
228 b += nc;
229 }
230 } while (--g != 0);
231}
232
Marat Dukhan683fab32020-08-03 19:42:52 -0700233void xnn_pack_qs8_gemm_xw_goi_w(
234 size_t g,
235 size_t nc,
236 size_t kc,
237 size_t nr,
238 size_t kr,
239 size_t sr,
240 const int8_t* k,
241 const int32_t* b,
242 void* packed_w,
243 const struct xnn_qs8_packing_params* params)
244{
245 assert(sr == 1);
246 const int32_t izp = (int32_t) params->input_zero_point;
247 do {
248 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
249 const size_t nr_block_size = min(nc - nr_block_start, nr);
250 int32_t* packed_b = (int32_t*) packed_w;
251 if XNN_LIKELY(b != NULL) {
252 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
253 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
254 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
255 }
256 } else {
257 size_t n = nr_block_size;
258 do {
259 *((int32_t*) packed_w) = 0;
260 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
261 } while (--n != 0);
262 }
263 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
264 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
265 const size_t kr_block_size = min(kc - kr_block_start, kr);
266 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
267 int32_t ksum = 0;
268 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
269 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
270 ksum += (int32_t) kv;
271 *((int16_t*) packed_w) = (int16_t) kv;
272 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int16_t));
273 }
274 packed_b[nr_block_offset] -= ksum * izp;
275 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int16_t));
276 }
277 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int16_t));
278 }
279 }
280 k += nc * kc;
281 if XNN_UNPREDICTABLE(b != NULL) {
282 b += nc;
283 }
284 } while (--g != 0);
285}
286
Marat Dukhana6879bd2020-07-06 14:25:08 -0700287void xnn_pack_f32_gemm_io_w(
288 size_t nc,
289 size_t kc,
290 size_t nr,
291 size_t kr,
292 size_t sr,
293 const float* k,
294 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700295 float* packed_w,
296 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700297{
298 const size_t skr = sr * kr;
299 const size_t skc = round_down_po2(kc, skr);
300 const size_t sr_mask = (sr - 1) * kr;
301 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
302 const size_t nr_block_size = min(nc - nr_block_start, nr);
303 if XNN_LIKELY(b != NULL) {
304 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
305 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
306 }
307 }
308 packed_w += nr;
309
310 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
311 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
312 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
313 *packed_w++ =
314 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
315 }
316 }
317 packed_w += (nr - nr_block_size) * kr;
318 }
319
320 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
321 const size_t kr_block_size = min(kc - kr_block_start, kr);
322 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
323 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
324 *packed_w++ =
325 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
326 }
327 packed_w += kr - kr_block_size;
328 }
329 packed_w += (nr - nr_block_size) * kr;
330 }
331 }
332}
333
334void xnn_pack_f16_gemm_io_w(
335 size_t nc,
336 size_t kc,
337 size_t nr,
338 size_t kr,
339 size_t sr,
340 const uint16_t* k,
341 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700342 uint16_t* packed_w,
343 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700344{
345 const size_t skr = sr * kr;
346 const size_t skc = round_down_po2(kc, skr);
347 const size_t sr_mask = (sr - 1) * kr;
348 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
349 const size_t nr_block_size = min(nc - nr_block_start, nr);
350 if XNN_LIKELY(b != NULL) {
351 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
352 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
353 }
354 }
355 packed_w += nr;
356
357 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
358 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
359 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
360 *packed_w++ =
361 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
362 }
363 }
364 packed_w += (nr - nr_block_size) * kr;
365 }
366
367 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
368 const size_t kr_block_size = min(kc - kr_block_start, kr);
369 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
370 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
371 *packed_w++ =
372 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
373 }
374 packed_w += kr - kr_block_size;
375 }
376 packed_w += (nr - nr_block_size) * kr;
377 }
378 }
379}
380
Marat Dukhan08b7a972020-07-14 18:17:29 -0700381void xnn_pack_qu8_gemm_io_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700382 size_t nc,
383 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700384 size_t nr,
385 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700386 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700387 const uint8_t* k,
388 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700389 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700390 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700391{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700392 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700393 const int32_t izp = (int32_t) params->input_zero_point;
394 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700395 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
396 const size_t nr_block_size = min(nc - nr_block_start, nr);
397 int32_t* packed_b = (int32_t*) packed_w;
398 if XNN_LIKELY(b != NULL) {
399 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
400 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
401 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
402 }
403 } else {
404 size_t n = nr_block_size;
405 do {
406 *((int32_t*) packed_w) = boff;
407 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
408 } while (--n != 0);
409 }
410 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
411 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
412 const size_t kr_block_size = min(kc - kr_block_start, kr);
413 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
414 int32_t ksum = 0;
415 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
416 const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
417 ksum += (int32_t) kv;
418 *((uint8_t*) packed_w) = kv;
419 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
420 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700421 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700422 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
423 }
424 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
425 }
426 }
427}
428
Marat Dukhana6879bd2020-07-06 14:25:08 -0700429void xnn_pack_f32_conv_goki_w(
430 size_t g,
431 size_t nc,
432 size_t ks,
433 size_t kc,
434 size_t nr,
435 size_t kr,
436 size_t sr,
437 const float* k,
438 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700439 float* packed_w,
440 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700441{
442 const size_t skr = sr * kr;
443 const size_t skc = round_down_po2(kc, skr);
444 const size_t sr_mask = (sr - 1) * kr;
445 do {
446 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
447 const size_t nr_block_size = min(nc - nr_block_start, nr);
448 if XNN_LIKELY(b != NULL) {
449 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
450 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
451 }
452 }
453 packed_w += nr;
454
455 for (size_t ki = 0; ki < ks; ki++) {
456 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
457 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
458 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
459 *packed_w++ =
460 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
461 }
462 }
463 packed_w += (nr - nr_block_size) * kr;
464 }
465
466 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
467 const size_t kr_block_size = min(kc - kr_block_start, kr);
468 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
469 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
470 *packed_w++ =
471 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
472 }
473 packed_w += kr - kr_block_size;
474 }
475 packed_w += (nr - nr_block_size) * kr;
476 }
477 }
478 }
479 k += ks * kc * nc;
480 if XNN_UNPREDICTABLE(b != NULL) {
481 b += nc;
482 }
483 } while (--g != 0);
484}
485
486void xnn_pack_f16_conv_goki_w(
487 size_t g,
488 size_t nc,
489 size_t ks,
490 size_t kc,
491 size_t nr,
492 size_t kr,
493 size_t sr,
494 const uint16_t* k,
495 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700496 uint16_t* packed_w,
497 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700498{
499 const size_t skr = sr * kr;
500 const size_t skc = round_down_po2(kc, skr);
501 const size_t sr_mask = (sr - 1) * kr;
502 do {
503 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
504 const size_t nr_block_size = min(nc - nr_block_start, nr);
505 if XNN_LIKELY(b != NULL) {
506 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
507 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
508 }
509 }
510 packed_w += nr;
511
512 for (size_t ki = 0; ki < ks; ki++) {
513 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
514 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
515 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
516 *packed_w++ =
517 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
518 }
519 }
520 packed_w += (nr - nr_block_size) * kr;
521 }
522
523 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
524 const size_t kr_block_size = min(kc - kr_block_start, kr);
525 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
526 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
527 *packed_w++ =
528 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
529 }
530 packed_w += kr - kr_block_size;
531 }
532 packed_w += (nr - nr_block_size) * kr;
533 }
534 }
535 }
536 k += ks * kc * nc;
537 if XNN_UNPREDICTABLE(b != NULL) {
538 b += nc;
539 }
540 } while (--g != 0);
541}
542
Marat Dukhan08b7a972020-07-14 18:17:29 -0700543void xnn_pack_qu8_conv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700544 size_t g,
545 size_t nc,
546 size_t ks,
547 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700548 size_t nr,
549 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700550 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700551 const uint8_t* k,
552 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700553 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700554 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700555{
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700556 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700557 const int32_t izp = (int32_t) params->input_zero_point;
558 const int32_t boff = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700559 do {
560 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
561 const size_t nr_block_size = min(nc - nr_block_start, nr);
562 int32_t* packed_b = (int32_t*) packed_w;
563 if XNN_LIKELY(b != NULL) {
564 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
565 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
566 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
567 }
568 } else {
569 size_t n = nr_block_size;
570 do {
571 *((int32_t*) packed_w) = boff;
572 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
573 } while (--n != 0);
574 }
575 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
576 for (size_t ki = 0; ki < ks; ki++) {
577 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
578 const size_t kr_block_size = min(kc - kr_block_start, kr);
579 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
580 int32_t ksum = 0;
581 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
582 const uint8_t kv =
583 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
584 ksum += (int32_t) kv;
585 *((uint8_t*) packed_w) = kv;
586 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
587 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700588 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700589 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
590 }
591 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
592 }
593 }
594 }
595 k += ks * kc * nc;
596 if XNN_UNPREDICTABLE(b != NULL) {
597 b += nc;
598 }
599 } while (--g != 0);
600}
601
Marat Dukhanf9480682020-07-31 14:50:24 -0700602void xnn_pack_qs8_conv_goki_w(
603 size_t g,
604 size_t nc,
605 size_t ks,
606 size_t kc,
607 size_t nr,
608 size_t kr,
609 size_t sr,
610 const int8_t* k,
611 const int32_t* b,
612 void* packed_w,
613 const struct xnn_qs8_packing_params* params)
614{
615 assert(sr == 1);
616 const int32_t izp = (int32_t) params->input_zero_point;
617 do {
618 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
619 const size_t nr_block_size = min(nc - nr_block_start, nr);
620 int32_t* packed_b = (int32_t*) packed_w;
621 if XNN_LIKELY(b != NULL) {
622 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
623 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
624 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
625 }
626 } else {
627 size_t n = nr_block_size;
628 do {
629 *((int32_t*) packed_w) = 0;
630 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
631 } while (--n != 0);
632 }
633 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
634 for (size_t ki = 0; ki < ks; ki++) {
635 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
636 const size_t kr_block_size = min(kc - kr_block_start, kr);
637 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
638 int32_t ksum = 0;
639 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
640 const int8_t kv =
641 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
642 ksum += (int32_t) kv;
643 *((int8_t*) packed_w) = kv;
644 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
645 }
646 packed_b[nr_block_offset] -= ksum * izp;
647 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
648 }
649 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
650 }
651 }
652 }
653 k += ks * kc * nc;
654 if XNN_UNPREDICTABLE(b != NULL) {
655 b += nc;
656 }
657 } while (--g != 0);
658}
659
Marat Dukhana6879bd2020-07-06 14:25:08 -0700660void xnn_pack_f32_conv_kgo_w(
661 size_t g,
662 size_t nc,
663 size_t ks,
664 size_t nr,
665 size_t kr,
666 const float* k,
667 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700668 float* packed_w,
669 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700670{
671 for (size_t i = 0; i < g; i++) {
672 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
673 const size_t nr_block_size = min(nc - nr_block_start, nr);
674 if XNN_LIKELY(b != NULL) {
675 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
676 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
677 }
678 }
679 packed_w += nr;
680 for (size_t ki = 0; ki < ks; ki++) {
681 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
682 *packed_w =
683 k[ki * g * nc + (nr_block_start + nr_block_offset)];
684 packed_w += kr;
685 }
686 packed_w += (nr - nr_block_size) * kr;
687 }
688 }
689 k += nc;
690 if XNN_UNPREDICTABLE(b != NULL) {
691 b += nc;
692 }
693 }
694}
695
696void xnn_pack_f16_conv_kgo_w(
697 size_t g,
698 size_t nc,
699 size_t ks,
700 size_t nr,
701 size_t kr,
702 const uint16_t* k,
703 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700704 uint16_t* packed_w,
705 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700706{
707 for (size_t i = 0; i < g; i++) {
708 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
709 const size_t nr_block_size = min(nc - nr_block_start, nr);
710 if XNN_LIKELY(b != NULL) {
711 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
712 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
713 }
714 }
715 packed_w += nr;
716 for (size_t ki = 0; ki < ks; ki++) {
717 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
718 *packed_w =
719 k[ki * g * nc + (nr_block_start + nr_block_offset)];
720 packed_w += kr;
721 }
722 packed_w += (nr - nr_block_size) * kr;
723 }
724 }
725 k += nc;
726 if XNN_UNPREDICTABLE(b != NULL) {
727 b += nc;
728 }
729 }
730}
731
Marat Dukhan08b7a972020-07-14 18:17:29 -0700732void xnn_pack_qu8_conv_kgo_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700733 size_t g,
734 size_t nc,
735 size_t ks,
Marat Dukhana6879bd2020-07-06 14:25:08 -0700736 size_t nr,
737 size_t kr,
Marat Dukhanab582382020-07-06 13:32:08 -0700738 const uint8_t* k,
739 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700740 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700741 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700742{
Marat Dukhanb42f8662020-07-06 20:46:13 -0700743 const int32_t izp = (int32_t) params->input_zero_point;
744 const int32_t boff = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700745 for (size_t i = 0; i < g; i++) {
746 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
747 const size_t nr_block_size = min(nc - nr_block_start, nr);
748 int32_t* packed_b = (int32_t*) packed_w;
749 if XNN_LIKELY(b != NULL) {
750 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
751 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
752 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
753 }
754 } else {
755 size_t n = nr_block_size;
756 do {
757 *((int32_t*) packed_w) = boff;
758 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
759 } while (--n != 0);
760 }
761 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
762 for (size_t ki = 0; ki < ks; ki++) {
763 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
764 const uint8_t kv =
765 k[ki * g * nc + (nr_block_start + nr_block_offset)];
766 *((uint8_t*) packed_w) = kv;
Marat Dukhanb42f8662020-07-06 20:46:13 -0700767 packed_b[nr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700768 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
769 }
770 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
771 }
772 }
773 k += nc;
774 if XNN_UNPREDICTABLE(b != NULL) {
775 b += nc;
776 }
777 }
778}
779
Marat Dukhana6879bd2020-07-06 14:25:08 -0700780void xnn_pack_f32_deconv_goki_w(
781 size_t g,
782 size_t nc,
783 size_t kh,
784 size_t kw,
785 size_t kc,
786 size_t sh,
787 size_t sw,
788 size_t nr,
789 size_t kr,
790 size_t sr,
791 const float* k,
792 const float* b,
793 float* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700794 struct subconvolution_params* subconv_params,
795 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700796{
797 const size_t skr = sr * kr;
798 const size_t skc = round_down_po2(kc, skr);
799 const size_t sr_mask = (sr - 1) * kr;
800 for (size_t i = 0; i < g; i++) {
801 for (size_t oy = 0; oy < sh; oy++) {
802 for (size_t ox = 0; ox < sw; ox++) {
803 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700804 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700805 }
806 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
807 const size_t nr_block_size = min(nc - nr_block_start, nr);
808 if XNN_LIKELY(b != NULL) {
809 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
810 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
811 }
812 }
813 packed_w += nr;
814 for (size_t ky = oy; ky < kh; ky += sh) {
815 for (size_t kx = ox; kx < kw; kx += sw) {
816 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
817 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
818 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
819 *packed_w++ =
820 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
821 }
822 }
823 packed_w += (nr - nr_block_size) * kr;
824 }
825
826 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
827 const size_t kr_block_size = min(kc - kr_block_start, kr);
828 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
829 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
830 *packed_w++ =
831 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
832 }
833 packed_w += kr - kr_block_size;
834 }
835 packed_w += (nr - nr_block_size) * kr;
836 }
837 }
838 }
839 }
840 }
841 }
842 k += kh * kw * kc * nc;
843 if XNN_UNPREDICTABLE(b != NULL) {
844 b += nc;
845 }
846 }
847}
848
849void xnn_pack_f16_deconv_goki_w(
850 size_t g,
851 size_t nc,
852 size_t kh,
853 size_t kw,
854 size_t kc,
855 size_t sh,
856 size_t sw,
857 size_t nr,
858 size_t kr,
859 size_t sr,
860 const uint16_t* k,
861 const uint16_t* b,
862 uint16_t* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700863 struct subconvolution_params* subconv_params,
864 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700865{
866 const size_t skr = sr * kr;
867 const size_t skc = round_down_po2(kc, skr);
868 const size_t sr_mask = (sr - 1) * kr;
869 for (size_t i = 0; i < g; i++) {
870 for (size_t oy = 0; oy < sh; oy++) {
871 for (size_t ox = 0; ox < sw; ox++) {
872 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700873 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -0700874 }
875 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
876 const size_t nr_block_size = min(nc - nr_block_start, nr);
877 if XNN_LIKELY(b != NULL) {
878 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
879 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
880 }
881 }
882 packed_w += nr;
883 for (size_t ky = oy; ky < kh; ky += sh) {
884 for (size_t kx = ox; kx < kw; kx += sw) {
885 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
886 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
887 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
888 *packed_w++ =
889 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
890 }
891 }
892 packed_w += (nr - nr_block_size) * kr;
893 }
894
895 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
896 const size_t kr_block_size = min(kc - kr_block_start, kr);
897 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
898 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
899 *packed_w++ =
900 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
901 }
902 packed_w += kr - kr_block_size;
903 }
904 packed_w += (nr - nr_block_size) * kr;
905 }
906 }
907 }
908 }
909 }
910 }
911 k += kh * kw * kc * nc;
912 if XNN_UNPREDICTABLE(b != NULL) {
913 b += nc;
914 }
915 }
916}
917
Marat Dukhan08b7a972020-07-14 18:17:29 -0700918void xnn_pack_qu8_deconv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700919 size_t g,
920 size_t nc,
921 size_t kh,
922 size_t kw,
923 size_t kc,
924 size_t sh,
925 size_t sw,
926 size_t nr,
927 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700928 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700929 const uint8_t* k,
930 const int32_t* b,
931 void* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700932 struct subconvolution_params* subconv_params,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700933 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700934{
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700935 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700936 const int32_t izp = (int32_t) params->input_zero_point;
937 const int32_t kzp = (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700938 for (size_t i = 0; i < g; i++) {
939 for (size_t oy = 0; oy < sh; oy++) {
940 for (size_t ox = 0; ox < sw; ox++) {
941 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -0700942 (*subconv_params++).weights = packed_w;
Marat Dukhanab582382020-07-06 13:32:08 -0700943 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700944 const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
Marat Dukhanab582382020-07-06 13:32:08 -0700945 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
946 const size_t nr_block_size = min(nc - nr_block_start, nr);
947 int32_t* packed_b = (int32_t*) packed_w;
948 if XNN_LIKELY(b != 0) {
949 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
950 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
951 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
952 }
953 } else {
954 size_t n = nr_block_size;
955 do {
956 *((int32_t*) packed_w) = boff;
957 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
958 } while (--n != 0);
959 }
960 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
961 for (size_t ky = oy; ky < kh; ky += sh) {
962 for (size_t kx = ox; kx < kw; kx += sw) {
963 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
964 const size_t kr_block_size = min(kc - kr_block_start, kr);
965 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
966 int32_t ksum = 0;
967 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
968 const uint8_t kv =
969 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
970 ksum += (int32_t) kv;
971 *((uint8_t*) packed_w) = kv;
972 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
973 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700974 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700975 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
976 }
977 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
978 }
979 }
980 }
981 }
982 }
983 }
984 k += kh * kw * kc * nc;
985 if XNN_UNPREDICTABLE(b != NULL) {
986 b += nc;
987 }
988 }
989}
990
Marat Dukhana6879bd2020-07-06 14:25:08 -0700991void xnn_pack_f32_dwconv_ghw_w(
992 size_t h,
993 size_t w,
994 size_t c,
995 size_t cr,
996 const float* k,
997 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700998 float* packed_w,
999 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001000{
1001 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1002 const size_t cr_block_size = min(c - cr_block_start, cr);
1003 if XNN_LIKELY(b != NULL) {
1004 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1005 *packed_w++ = b[cr_block_start + cr_block_offset];
1006 }
1007 } else {
1008 size_t n = cr_block_size;
1009 do {
1010 *packed_w++ = 0.0f;
1011 } while (--n != 0);
1012 }
1013 packed_w += cr - cr_block_size;
1014 for (size_t x = 0; x < w; x++) {
1015 for (size_t y = 0; y < h; y++) {
1016 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1017 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1018 *packed_w++ = kv;
1019 }
1020 packed_w += cr - cr_block_size;
1021 }
1022 }
1023 }
1024}
1025
1026void xnn_pack_f16_dwconv_ghw_w(
1027 size_t h,
1028 size_t w,
1029 size_t c,
1030 size_t cr,
1031 const uint16_t* k,
1032 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001033 uint16_t* packed_w,
1034 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001035{
1036 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1037 const size_t cr_block_size = min(c - cr_block_start, cr);
1038 if XNN_LIKELY(b != NULL) {
1039 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1040 *packed_w++ = b[cr_block_start + cr_block_offset];
1041 }
1042 } else {
1043 size_t n = cr_block_size;
1044 do {
1045 *packed_w++ = 0;
1046 } while (--n != 0);
1047 }
1048 packed_w += cr - cr_block_size;
1049 for (size_t x = 0; x < w; x++) {
1050 for (size_t y = 0; y < h; y++) {
1051 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1052 const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1053 *packed_w++ = kv;
1054 }
1055 packed_w += cr - cr_block_size;
1056 }
1057 }
1058 }
1059}
1060
Marat Dukhan08b7a972020-07-14 18:17:29 -07001061void xnn_pack_qu8_dwconv_ghw_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001062 size_t h,
1063 size_t w,
1064 size_t c,
1065 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001066 const uint8_t* k,
1067 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001068 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001069 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001070{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001071 const int32_t izp = (int32_t) params->input_zero_point;
1072 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001073 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1074 const size_t cr_block_size = min(c - cr_block_start, cr);
1075 int32_t* packed_b = (int32_t*) packed_w;
1076 if XNN_LIKELY(b != NULL) {
1077 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1078 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1079 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1080 }
1081 } else {
1082 size_t n = cr_block_size;
1083 do {
1084 *((int32_t*) packed_w) = boff;
1085 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1086 } while (--n != 0);
1087 }
1088 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1089 for (size_t x = 0; x < w; x++) {
1090 for (size_t y = 0; y < h; y++) {
1091 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1092 const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001093 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001094 *((uint8_t*) packed_w) = kv;
1095 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1096 }
1097 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1098 }
1099 }
1100 }
1101}
1102
Marat Dukhana6879bd2020-07-06 14:25:08 -07001103void xnn_pack_f32_dwconv_hwg_w(
1104 size_t h,
1105 size_t w,
1106 size_t c,
1107 size_t cr,
1108 const float* k,
1109 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001110 float* packed_w,
1111 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001112{
1113 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1114 const size_t cr_block_size = min(c - cr_block_start, cr);
1115 if XNN_LIKELY(b != NULL) {
1116 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1117 *packed_w++ = b[cr_block_start + cr_block_offset];
1118 }
1119 } else {
1120 size_t n = cr_block_size;
1121 do {
1122 *packed_w++ = 0.0f;
1123 } while (--n != 0);
1124 }
1125 packed_w += cr - cr_block_size;
1126 for (size_t x = 0; x < w; x++) {
1127 for (size_t y = 0; y < h; y++) {
1128 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1129 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1130 *packed_w++ = kv;
1131 }
1132 packed_w += cr - cr_block_size;
1133 }
1134 }
1135 }
1136}
1137
1138void xnn_pack_f16_dwconv_hwg_w(
1139 size_t h,
1140 size_t w,
1141 size_t c,
1142 size_t cr,
1143 const uint16_t* k,
1144 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001145 uint16_t* packed_w,
1146 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001147{
1148 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1149 const size_t cr_block_size = min(c - cr_block_start, cr);
1150 if XNN_LIKELY(b != NULL) {
1151 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1152 *packed_w++ = b[cr_block_start + cr_block_offset];
1153 }
1154 } else {
1155 size_t n = cr_block_size;
1156 do {
1157 *packed_w++ = 0;
1158 } while (--n != 0);
1159 }
1160 packed_w += cr - cr_block_size;
1161 for (size_t x = 0; x < w; x++) {
1162 for (size_t y = 0; y < h; y++) {
1163 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1164 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1165 *packed_w++ = kv;
1166 }
1167 packed_w += cr - cr_block_size;
1168 }
1169 }
1170 }
1171}
1172
Marat Dukhan08b7a972020-07-14 18:17:29 -07001173void xnn_pack_qu8_dwconv_hwg_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001174 size_t h,
1175 size_t w,
1176 size_t c,
1177 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001178 const uint8_t* k,
1179 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001180 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001181 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001182{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001183 const int32_t izp = (int32_t) params->input_zero_point;
1184 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001185 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1186 const size_t cr_block_size = min(c - cr_block_start, cr);
1187 int32_t* packed_b = (int32_t*) packed_w;
1188 if XNN_LIKELY(b != NULL) {
1189 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1190 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1191 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1192 }
1193 } else {
1194 size_t n = cr_block_size;
1195 do {
1196 *((int32_t*) packed_w) = boff;
1197 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1198 } while (--n != 0);
1199 }
1200 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1201 for (size_t x = 0; x < w; x++) {
1202 for (size_t y = 0; y < h; y++) {
1203 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1204 const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001205 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001206 *((uint8_t*) packed_w) = kv;
1207 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1208 }
1209 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1210 }
1211 }
1212 }
1213}
1214
// Packs f32 GEMM weights without a bias section. Within each group the
// kernel is read as k[n * kc + c].
//
// For each of the g groups and each block of up to nr output channels, the
// kc input channels are emitted in kr-wide chunks, one chunk per channel of
// the block. Slots for absent channels or for the unused part of a partial
// kr-chunk are skipped, not written. The body runs at least once because
// the group loop is a do/while that decrements g. The params argument is
// not used.
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t n_start = 0; n_start < nc; n_start += nr) {
      const size_t n_count = min(nc - n_start, nr);
      // Output slots skipped after each kr-chunk row for absent channels.
      const size_t n_padding = (nr - n_count) * kr;

      // Full sr * kr super-blocks: each channel's chunk is rotated by its
      // position within the nr block, masked with sr_mask.
      for (size_t k_start = 0; k_start < skc; k_start += kr) {
        const size_t k_base = round_down_po2(k_start, skr);
        for (size_t n = 0; n < n_count; n++) {
          const size_t k_shift = (k_start + n * kr) & sr_mask;
          const float* src = k + (n_start + n) * kc + k_base + k_shift;
          for (size_t i = 0; i < kr; i++) {
            packed_w[i] = src[i];
          }
          packed_w += kr;
        }
        packed_w += n_padding;
      }

      // Remaining input channels past skc, copied in order.
      for (size_t k_start = skc; k_start < kc; k_start += kr) {
        const size_t k_count = min(kc - k_start, kr);
        for (size_t n = 0; n < n_count; n++) {
          const float* src = k + (n_start + n) * kc + k_start;
          for (size_t i = 0; i < k_count; i++) {
            packed_w[i] = src[i];
          }
          packed_w += kr;
        }
        packed_w += n_padding;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}
1258
// Packs f16 (stored as uint16_t) GEMM weights without a bias section.
// Within each group the kernel is read as k[n * kc + c].
//
// For each of the g groups and each block of up to nr output channels, the
// kc input channels are emitted in kr-wide chunks, one chunk per channel of
// the block. Slots for absent channels or for the unused part of a partial
// kr-chunk are skipped, not written. The body runs at least once because
// the group loop is a do/while that decrements g. The params argument is
// not used.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t n_start = 0; n_start < nc; n_start += nr) {
      const size_t n_count = min(nc - n_start, nr);
      // Output slots skipped after each kr-chunk row for absent channels.
      const size_t n_padding = (nr - n_count) * kr;

      // Full sr * kr super-blocks: each channel's chunk is rotated by its
      // position within the nr block, masked with sr_mask.
      for (size_t k_start = 0; k_start < skc; k_start += kr) {
        const size_t k_base = round_down_po2(k_start, skr);
        for (size_t n = 0; n < n_count; n++) {
          const size_t k_shift = (k_start + n * kr) & sr_mask;
          const uint16_t* src = k + (n_start + n) * kc + k_base + k_shift;
          for (size_t i = 0; i < kr; i++) {
            packed_w[i] = src[i];
          }
          packed_w += kr;
        }
        packed_w += n_padding;
      }

      // Remaining input channels past skc, copied in order.
      for (size_t k_start = skc; k_start < kc; k_start += kr) {
        const size_t k_count = min(kc - k_start, kr);
        for (size_t n = 0; n < n_count; n++) {
          const uint16_t* src = k + (n_start + n) * kc + k_start;
          for (size_t i = 0; i < k_count; i++) {
            packed_w[i] = src[i];
          }
          packed_w += kr;
        }
        packed_w += n_padding;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}
1302
Marat Dukhana6879bd2020-07-06 14:25:08 -07001303void xnn_pack_f32_dconv_oki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001304 size_t nc,
Marat Dukhanab582382020-07-06 13:32:08 -07001305 size_t kc,
1306 size_t nr,
Marat Dukhana6879bd2020-07-06 14:25:08 -07001307 size_t kh,
1308 size_t kw,
1309 const float* k,
1310 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001311 float* packed_w,
1312 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001313{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001314 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1315 const size_t nr_block_size = min(nc - nr_block_start, nr);
1316 if XNN_LIKELY(b != NULL) {
1317 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1318 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
Marat Dukhanab582382020-07-06 13:32:08 -07001319 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001320 } else {
1321 size_t n = nr;
1322 do {
1323 *packed_w++ = 0.0f;
1324 } while (--n != 0);
1325 }
Marat Dukhanab582382020-07-06 13:32:08 -07001326
Marat Dukhana6879bd2020-07-06 14:25:08 -07001327 for (size_t kx = 0; kx < kw; kx++) {
1328 for (size_t c = 0; c < kc; c++) {
1329 for (size_t ky = 0; ky < kh; ky++) {
1330 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1331 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
Marat Dukhanab582382020-07-06 13:32:08 -07001332 }
Marat Dukhanab582382020-07-06 13:32:08 -07001333 }
1334 }
1335 }
Marat Dukhanab582382020-07-06 13:32:08 -07001336 if XNN_UNPREDICTABLE(b != NULL) {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001337 b += nr;
Marat Dukhanab582382020-07-06 13:32:08 -07001338 }
1339 }
1340}
1341
1342void xnn_pack_f16_dconv_oki_w(
1343 size_t nc,
1344 size_t kc,
1345 size_t nr,
1346 size_t kh,
1347 size_t kw,
1348 const uint16_t* k,
1349 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001350 uint16_t* packed_w,
1351 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001352{
1353 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1354 const size_t nr_block_size = min(nc - nr_block_start, nr);
1355 if XNN_LIKELY(b != NULL) {
1356 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1357 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
1358 }
1359 } else {
1360 size_t n = nr;
1361 do {
1362 *packed_w++ = 0;
1363 } while (--n != 0);
1364 }
1365
1366 for (size_t kx = 0; kx < kw; kx++) {
1367 for (size_t c = 0; c < kc; c++) {
1368 for (size_t ky = 0; ky < kh; ky++) {
1369 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1370 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
1371 }
1372 }
1373 }
1374 }
1375 if XNN_UNPREDICTABLE(b != NULL) {
1376 b += nr;
1377 }
1378 }
1379}
1380
Marat Dukhana6879bd2020-07-06 14:25:08 -07001381void xnn_pack_f32_chw_dwconv_ghw_w(
1382 size_t kernel_size,
1383 size_t groups,
1384 const float* kernel,
1385 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001386 float* packed_weights,
1387 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001388{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001389 for (size_t g = 0; g < groups; g++) {
1390 if XNN_LIKELY(bias != NULL) {
1391 *packed_weights = *bias++;
Marat Dukhanab582382020-07-06 13:32:08 -07001392 } else {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001393 *packed_weights = 0.0f;
Marat Dukhanab582382020-07-06 13:32:08 -07001394 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001395 packed_weights += 1;
1396 for (size_t i = 0; i < kernel_size; i++) {
1397 *packed_weights++ = kernel[g * kernel_size + i];
Marat Dukhanab582382020-07-06 13:32:08 -07001398 }
1399 }
1400}
1401
1402void xnn_pack_f16_chw_dwconv_ghw_w(
1403 size_t kernel_size,
1404 size_t groups,
1405 const uint16_t* kernel,
1406 const uint16_t* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001407 uint16_t* packed_weights,
1408 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001409{
1410 for (size_t g = 0; g < groups; g++) {
1411 if XNN_LIKELY(bias != NULL) {
1412 *packed_weights = *bias++;
1413 } else {
1414 *packed_weights = 0;
1415 }
1416 packed_weights += 1;
1417 for (size_t i = 0; i < kernel_size; i++) {
1418 *packed_weights++ = kernel[g * kernel_size + i];
1419 }
1420 }
1421}
1422
Marat Dukhanab582382020-07-06 13:32:08 -07001423void xnn_pack_f32_chw_dwconv_hwg_w(
1424 size_t kernel_size,
1425 size_t groups,
1426 const float* kernel,
1427 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001428 float* packed_weights,
1429 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001430{
1431 for (size_t g = 0; g < groups; g++) {
1432 if XNN_LIKELY(bias != NULL) {
1433 *packed_weights = *bias++;
1434 } else {
1435 *packed_weights = 0.0f;
1436 }
1437 packed_weights += 1;
1438 for (size_t i = 0; i < kernel_size; i++) {
1439 *packed_weights++ = kernel[i * groups + g];
1440 }
1441 }
1442}
1443
1444void xnn_pack_f32_vmulcaddc_w(
1445 size_t c,
1446 size_t cr,
1447 const float* s,
1448 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001449 float* packed_w,
1450 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001451{
1452 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1453 const size_t cr_block_size = min(c - cr_block_start, cr);
1454 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1455 *packed_w++ = s[cr_block_start + cr_block_offset];
1456 }
1457 packed_w += cr - cr_block_size;
1458 if XNN_LIKELY(b != NULL) {
1459 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1460 *packed_w++ = b[cr_block_start + cr_block_offset];
1461 }
1462 } else {
1463 size_t n = cr_block_size;
1464 do {
1465 *packed_w++ = 0.0f;
1466 } while (--n != 0);
1467 }
1468 packed_w += cr - cr_block_size;
1469 }
1470}
1471
1472void xnn_pack_f16_vmulcaddc_w(
1473 size_t c,
1474 size_t cr,
1475 const uint16_t* s,
1476 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001477 uint16_t* packed_w,
1478 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001479{
1480 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1481 const size_t cr_block_size = min(c - cr_block_start, cr);
1482 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1483 *packed_w++ = s[cr_block_start + cr_block_offset];
1484 }
1485 packed_w += cr - cr_block_size;
1486 if XNN_LIKELY(b != NULL) {
1487 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1488 *packed_w++ = b[cr_block_start + cr_block_offset];
1489 }
1490 } else {
1491 size_t n = cr_block_size;
1492 do {
1493 *packed_w++ = 0;
1494 } while (--n != 0);
1495 }
1496 packed_w += cr - cr_block_size;
1497 }
1498}