blob: 63619dcad620a26b5bbe14d63432b8b3320b36cf [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8
9#include <stdint.h>
10#include <stddef.h>
Frank Barchard952cb512021-10-28 11:39:07 -070011#include <stdio.h> // for printf
Marat Dukhanab582382020-07-06 13:32:08 -070012
13#include <xnnpack/math.h>
14#include <xnnpack/pack.h>
15
16
Marat Dukhana6879bd2020-07-06 14:25:08 -070017void xnn_pack_f32_gemm_goi_w(
18 size_t g,
19 size_t nc,
20 size_t kc,
21 size_t nr,
22 size_t kr,
23 size_t sr,
24 const float* k,
25 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070026 float* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -070027 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -070028 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070029{
30 const size_t skr = sr * kr;
31 const size_t skc = round_down_po2(kc, skr);
32 const size_t sr_mask = (sr - 1) * kr;
33 do {
34 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
35 const size_t nr_block_size = min(nc - nr_block_start, nr);
36 if XNN_LIKELY(b != NULL) {
37 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
38 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
39 }
40 }
41 packed_w += nr;
42
43 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
44 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
45 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
46 *packed_w++ =
47 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
48 }
49 }
50 packed_w += (nr - nr_block_size) * kr;
51 }
52
53 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
54 const size_t kr_block_size = min(kc - kr_block_start, kr);
55 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
56 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
57 *packed_w++ =
58 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
59 }
60 packed_w += kr - kr_block_size;
61 }
62 packed_w += (nr - nr_block_size) * kr;
63 }
Marat Dukhane06c8132021-06-03 08:59:11 -070064 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -070065 }
66 k += nc * kc;
67 if XNN_UNPREDICTABLE(b != NULL) {
68 b += nc;
69 }
70 } while (--g != 0);
71}
72
73void xnn_pack_f16_gemm_goi_w(
74 size_t g,
75 size_t nc,
76 size_t kc,
77 size_t nr,
78 size_t kr,
79 size_t sr,
80 const uint16_t* k,
81 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -070082 uint16_t* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -070083 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -070084 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -070085{
86 const size_t skr = sr * kr;
87 const size_t skc = round_down_po2(kc, skr);
88 const size_t sr_mask = (sr - 1) * kr;
89 do {
90 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
91 const size_t nr_block_size = min(nc - nr_block_start, nr);
92 if XNN_LIKELY(b != NULL) {
93 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
94 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
95 }
96 }
97 packed_w += nr;
98
99 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
100 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
101 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
102 *packed_w++ =
103 k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
104 }
105 }
106 packed_w += (nr - nr_block_size) * kr;
107 }
108
109 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
110 const size_t kr_block_size = min(kc - kr_block_start, kr);
111 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
112 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
113 *packed_w++ =
114 k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
115 }
116 packed_w += kr - kr_block_size;
117 }
118 packed_w += (nr - nr_block_size) * kr;
119 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700120 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700121 }
122 k += nc * kc;
123 if XNN_UNPREDICTABLE(b != NULL) {
124 b += nc;
125 }
126 } while (--g != 0);
127}
128
Marat Dukhan08b7a972020-07-14 18:17:29 -0700129void xnn_pack_qu8_gemm_goi_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700130 size_t g,
131 size_t nc,
132 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700133 size_t nr,
134 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700135 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700136 const uint8_t* k,
137 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700138 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700139 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700140 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700141{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700142 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700143 const int32_t izp = (int32_t) params->input_zero_point;
144 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700145 do {
146 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
147 const size_t nr_block_size = min(nc - nr_block_start, nr);
148 int32_t* packed_b = (int32_t*) packed_w;
149 if XNN_LIKELY(b != NULL) {
150 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
151 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
152 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
153 }
154 } else {
155 size_t n = nr_block_size;
156 do {
157 *((int32_t*) packed_w) = boff;
158 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
159 } while (--n != 0);
160 }
161 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
162 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
163 const size_t kr_block_size = min(kc - kr_block_start, kr);
164 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
165 int32_t ksum = 0;
166 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
167 const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
168 ksum += (int32_t) kv;
169 *((uint8_t*) packed_w) = kv;
170 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
171 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700172 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700173 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
174 }
175 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
176 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700177 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -0700178 }
179 k += nc * kc;
180 if XNN_UNPREDICTABLE(b != NULL) {
181 b += nc;
182 }
183 } while (--g != 0);
184}
185
Marat Dukhan595e1702020-07-31 10:12:52 -0700186void xnn_pack_qs8_gemm_goi_w(
187 size_t g,
188 size_t nc,
189 size_t kc,
190 size_t nr,
191 size_t kr,
192 size_t sr,
193 const int8_t* k,
194 const int32_t* b,
195 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700196 size_t extra_bytes,
Marat Dukhan595e1702020-07-31 10:12:52 -0700197 const struct xnn_qs8_packing_params* params)
198{
Frank Barchard952cb512021-10-28 11:39:07 -0700199 const size_t skr = sr * kr;
200 const size_t skc = round_down_po2(kc, skr);
201 const size_t sr_mask = (sr - 1) * kr;
Marat Dukhan595e1702020-07-31 10:12:52 -0700202 const int32_t izp = (int32_t) params->input_zero_point;
203 do {
204 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
205 const size_t nr_block_size = min(nc - nr_block_start, nr);
206 int32_t* packed_b = (int32_t*) packed_w;
207 if XNN_LIKELY(b != NULL) {
208 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
209 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
210 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
211 }
212 } else {
213 size_t n = nr_block_size;
214 do {
215 *((int32_t*) packed_w) = 0;
216 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
217 } while (--n != 0);
218 }
219 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
Frank Barchard952cb512021-10-28 11:39:07 -0700220
221 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
222 const size_t kr_block_size = min(kc - kr_block_start, kr);
223 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
224 int32_t ksum = 0;
225 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
226 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset)];
227 ksum += (int32_t) kv;
228 *((int8_t*) packed_w) = kv;
229 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
230 }
231 packed_b[nr_block_offset] -= ksum * izp;
232 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
233 }
234 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
235 }
236
237 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
Marat Dukhan595e1702020-07-31 10:12:52 -0700238 const size_t kr_block_size = min(kc - kr_block_start, kr);
239 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
240 int32_t ksum = 0;
241 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
242 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
243 ksum += (int32_t) kv;
244 *((int8_t*) packed_w) = kv;
245 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
246 }
247 packed_b[nr_block_offset] -= ksum * izp;
248 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
249 }
250 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
251 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700252 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan595e1702020-07-31 10:12:52 -0700253 }
254 k += nc * kc;
255 if XNN_UNPREDICTABLE(b != NULL) {
256 b += nc;
257 }
258 } while (--g != 0);
259}
260
Marat Dukhan683fab32020-08-03 19:42:52 -0700261void xnn_pack_qs8_gemm_xw_goi_w(
262 size_t g,
263 size_t nc,
264 size_t kc,
265 size_t nr,
266 size_t kr,
267 size_t sr,
268 const int8_t* k,
269 const int32_t* b,
270 void* packed_w,
Marat Dukhan0b043742021-06-02 18:29:11 -0700271 size_t extra_bytes,
Marat Dukhan683fab32020-08-03 19:42:52 -0700272 const struct xnn_qs8_packing_params* params)
273{
Frank Barchard66ae2572021-11-02 17:36:21 -0700274 const size_t skr = sr * kr;
275 const size_t skc = round_down_po2(kc, skr);
276 const size_t sr_mask = (sr - 1) * kr;
Marat Dukhan683fab32020-08-03 19:42:52 -0700277 const int32_t izp = (int32_t) params->input_zero_point;
278 do {
279 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
280 const size_t nr_block_size = min(nc - nr_block_start, nr);
281 int32_t* packed_b = (int32_t*) packed_w;
282 if XNN_LIKELY(b != NULL) {
283 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
284 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
285 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
286 }
287 } else {
288 size_t n = nr_block_size;
289 do {
290 *((int32_t*) packed_w) = 0;
291 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
292 } while (--n != 0);
293 }
294 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
Frank Barchard66ae2572021-11-02 17:36:21 -0700295
296 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
297 const size_t kr_block_size = min(kc - kr_block_start, kr);
298 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
299 int32_t ksum = 0;
300 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
301 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset)];
302 ksum += (int32_t) kv;
303 *((int16_t*) packed_w) = (int16_t) kv;
304 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int16_t));
305 }
306 packed_b[nr_block_offset] -= ksum * izp;
307 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int16_t));
308 }
309 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int16_t));
310 }
311
312 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
Marat Dukhan683fab32020-08-03 19:42:52 -0700313 const size_t kr_block_size = min(kc - kr_block_start, kr);
314 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
315 int32_t ksum = 0;
316 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
317 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
318 ksum += (int32_t) kv;
319 *((int16_t*) packed_w) = (int16_t) kv;
320 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int16_t));
321 }
322 packed_b[nr_block_offset] -= ksum * izp;
323 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int16_t));
324 }
325 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int16_t));
326 }
Marat Dukhan0b043742021-06-02 18:29:11 -0700327 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan683fab32020-08-03 19:42:52 -0700328 }
329 k += nc * kc;
330 if XNN_UNPREDICTABLE(b != NULL) {
331 b += nc;
332 }
333 } while (--g != 0);
334}
335
Marat Dukhana6879bd2020-07-06 14:25:08 -0700336void xnn_pack_f32_gemm_io_w(
337 size_t nc,
338 size_t kc,
339 size_t nr,
340 size_t kr,
341 size_t sr,
342 const float* k,
343 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700344 float* packed_w,
345 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700346{
347 const size_t skr = sr * kr;
348 const size_t skc = round_down_po2(kc, skr);
349 const size_t sr_mask = (sr - 1) * kr;
350 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
351 const size_t nr_block_size = min(nc - nr_block_start, nr);
352 if XNN_LIKELY(b != NULL) {
353 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
354 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
355 }
356 }
357 packed_w += nr;
358
359 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
360 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
361 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
362 *packed_w++ =
363 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
364 }
365 }
366 packed_w += (nr - nr_block_size) * kr;
367 }
368
369 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
370 const size_t kr_block_size = min(kc - kr_block_start, kr);
371 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
372 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
373 *packed_w++ =
374 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
375 }
376 packed_w += kr - kr_block_size;
377 }
378 packed_w += (nr - nr_block_size) * kr;
379 }
380 }
381}
382
383void xnn_pack_f16_gemm_io_w(
384 size_t nc,
385 size_t kc,
386 size_t nr,
387 size_t kr,
388 size_t sr,
389 const uint16_t* k,
390 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700391 uint16_t* packed_w,
392 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700393{
394 const size_t skr = sr * kr;
395 const size_t skc = round_down_po2(kc, skr);
396 const size_t sr_mask = (sr - 1) * kr;
397 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
398 const size_t nr_block_size = min(nc - nr_block_start, nr);
399 if XNN_LIKELY(b != NULL) {
400 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
401 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
402 }
403 }
404 packed_w += nr;
405
406 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
407 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
408 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
409 *packed_w++ =
410 k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
411 }
412 }
413 packed_w += (nr - nr_block_size) * kr;
414 }
415
416 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
417 const size_t kr_block_size = min(kc - kr_block_start, kr);
418 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
419 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
420 *packed_w++ =
421 k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
422 }
423 packed_w += kr - kr_block_size;
424 }
425 packed_w += (nr - nr_block_size) * kr;
426 }
427 }
428}
429
Marat Dukhan08b7a972020-07-14 18:17:29 -0700430void xnn_pack_qu8_gemm_io_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700431 size_t nc,
432 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700433 size_t nr,
434 size_t kr,
Marat Dukhanbc08f312020-07-07 16:22:04 -0700435 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700436 const uint8_t* k,
437 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700438 void* packed_w,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700439 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700440{
Marat Dukhanbc08f312020-07-07 16:22:04 -0700441 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700442 const int32_t izp = (int32_t) params->input_zero_point;
443 const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700444 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
445 const size_t nr_block_size = min(nc - nr_block_start, nr);
446 int32_t* packed_b = (int32_t*) packed_w;
447 if XNN_LIKELY(b != NULL) {
448 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
449 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
450 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
451 }
452 } else {
453 size_t n = nr_block_size;
454 do {
455 *((int32_t*) packed_w) = boff;
456 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
457 } while (--n != 0);
458 }
459 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
460 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
461 const size_t kr_block_size = min(kc - kr_block_start, kr);
462 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
463 int32_t ksum = 0;
464 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
465 const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
466 ksum += (int32_t) kv;
467 *((uint8_t*) packed_w) = kv;
468 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
469 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700470 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700471 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
472 }
473 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
474 }
475 }
476}
477
Marat Dukhand23cb6e2021-04-01 01:18:58 -0700478void xnn_pack_qs8_gemm_io_w(
479 size_t nc,
480 size_t kc,
481 size_t nr,
482 size_t kr,
483 size_t sr,
484 const int8_t* k,
485 const int32_t* b,
486 void* packed_w,
487 const struct xnn_qs8_packing_params* params)
488{
Frank Barchard66ae2572021-11-02 17:36:21 -0700489 const size_t skr = sr * kr;
490 const size_t skc = round_down_po2(kc, skr);
491 const size_t sr_mask = (sr - 1) * kr;
Marat Dukhand23cb6e2021-04-01 01:18:58 -0700492 const int32_t izp = (int32_t) params->input_zero_point;
493 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
494 const size_t nr_block_size = min(nc - nr_block_start, nr);
495 int32_t* packed_b = (int32_t*) packed_w;
496 if XNN_LIKELY(b != NULL) {
497 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
498 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
499 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
500 }
501 } else {
502 size_t n = nr_block_size;
503 do {
504 *((int32_t*) packed_w) = 0;
505 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
506 } while (--n != 0);
507 }
508 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
Frank Barchard66ae2572021-11-02 17:36:21 -0700509
510 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
511 const size_t kr_block_size = min(kc - kr_block_start, kr);
512 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
513 int32_t ksum = 0;
514 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
515 const int8_t kv = k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
516 ksum += (int32_t) kv;
517 *((int8_t*) packed_w) = kv;
518 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
519 }
520 packed_b[nr_block_offset] -= ksum * izp;
521 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
522 }
523 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
524 }
525
526 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
Marat Dukhand23cb6e2021-04-01 01:18:58 -0700527 const size_t kr_block_size = min(kc - kr_block_start, kr);
528 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
529 int32_t ksum = 0;
530 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
531 const int8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
532 ksum += (int32_t) kv;
533 *((int8_t*) packed_w) = kv;
534 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
535 }
536 packed_b[nr_block_offset] -= ksum * izp;
537 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
538 }
539 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
540 }
541 }
542}
543
Marat Dukhana6879bd2020-07-06 14:25:08 -0700544void xnn_pack_f32_conv_goki_w(
545 size_t g,
546 size_t nc,
547 size_t ks,
548 size_t kc,
549 size_t nr,
550 size_t kr,
551 size_t sr,
552 const float* k,
553 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700554 float* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700555 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700556 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700557{
558 const size_t skr = sr * kr;
559 const size_t skc = round_down_po2(kc, skr);
560 const size_t sr_mask = (sr - 1) * kr;
561 do {
562 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
563 const size_t nr_block_size = min(nc - nr_block_start, nr);
564 if XNN_LIKELY(b != NULL) {
565 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
566 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
567 }
568 }
569 packed_w += nr;
570
571 for (size_t ki = 0; ki < ks; ki++) {
572 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
573 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
574 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
575 *packed_w++ =
576 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
577 }
578 }
579 packed_w += (nr - nr_block_size) * kr;
580 }
581
582 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
583 const size_t kr_block_size = min(kc - kr_block_start, kr);
584 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
585 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
586 *packed_w++ =
587 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
588 }
589 packed_w += kr - kr_block_size;
590 }
591 packed_w += (nr - nr_block_size) * kr;
592 }
593 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700594 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700595 }
596 k += ks * kc * nc;
597 if XNN_UNPREDICTABLE(b != NULL) {
598 b += nc;
599 }
600 } while (--g != 0);
601}
602
603void xnn_pack_f16_conv_goki_w(
604 size_t g,
605 size_t nc,
606 size_t ks,
607 size_t kc,
608 size_t nr,
609 size_t kr,
610 size_t sr,
611 const uint16_t* k,
612 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700613 uint16_t* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700614 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700615 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700616{
617 const size_t skr = sr * kr;
618 const size_t skc = round_down_po2(kc, skr);
619 const size_t sr_mask = (sr - 1) * kr;
620 do {
621 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
622 const size_t nr_block_size = min(nc - nr_block_start, nr);
623 if XNN_LIKELY(b != NULL) {
624 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
625 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
626 }
627 }
628 packed_w += nr;
629
630 for (size_t ki = 0; ki < ks; ki++) {
631 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
632 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
633 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
634 *packed_w++ =
635 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
636 }
637 }
638 packed_w += (nr - nr_block_size) * kr;
639 }
640
641 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
642 const size_t kr_block_size = min(kc - kr_block_start, kr);
643 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
644 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
645 *packed_w++ =
646 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
647 }
648 packed_w += kr - kr_block_size;
649 }
650 packed_w += (nr - nr_block_size) * kr;
651 }
652 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700653 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700654 }
655 k += ks * kc * nc;
656 if XNN_UNPREDICTABLE(b != NULL) {
657 b += nc;
658 }
659 } while (--g != 0);
660}
661
Marat Dukhan08b7a972020-07-14 18:17:29 -0700662void xnn_pack_qu8_conv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700663 size_t g,
664 size_t nc,
665 size_t ks,
666 size_t kc,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700667 size_t nr,
668 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700669 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -0700670 const uint8_t* k,
671 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700672 void* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700673 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700674 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700675{
Marat Dukhan5a698bb2020-07-07 20:47:55 -0700676 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -0700677 const int32_t izp = (int32_t) params->input_zero_point;
678 const int32_t boff = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700679 do {
680 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
681 const size_t nr_block_size = min(nc - nr_block_start, nr);
682 int32_t* packed_b = (int32_t*) packed_w;
683 if XNN_LIKELY(b != NULL) {
684 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
685 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
686 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
687 }
688 } else {
689 size_t n = nr_block_size;
690 do {
691 *((int32_t*) packed_w) = boff;
692 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
693 } while (--n != 0);
694 }
695 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
696 for (size_t ki = 0; ki < ks; ki++) {
697 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
698 const size_t kr_block_size = min(kc - kr_block_start, kr);
699 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
700 int32_t ksum = 0;
701 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
702 const uint8_t kv =
703 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
704 ksum += (int32_t) kv;
705 *((uint8_t*) packed_w) = kv;
706 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
707 }
Marat Dukhanb42f8662020-07-06 20:46:13 -0700708 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700709 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
710 }
711 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
712 }
713 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700714 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -0700715 }
716 k += ks * kc * nc;
717 if XNN_UNPREDICTABLE(b != NULL) {
718 b += nc;
719 }
720 } while (--g != 0);
721}
722
Marat Dukhanf9480682020-07-31 14:50:24 -0700723void xnn_pack_qs8_conv_goki_w(
724 size_t g,
725 size_t nc,
726 size_t ks,
727 size_t kc,
728 size_t nr,
729 size_t kr,
730 size_t sr,
731 const int8_t* k,
732 const int32_t* b,
733 void* packed_w,
Marat Dukhane06c8132021-06-03 08:59:11 -0700734 size_t extra_bytes,
Marat Dukhanf9480682020-07-31 14:50:24 -0700735 const struct xnn_qs8_packing_params* params)
736{
Frank Barchard952cb512021-10-28 11:39:07 -0700737 const size_t skr = sr * kr;
738 const size_t skc = round_down_po2(kc, skr);
739 const size_t sr_mask = (sr - 1) * kr;
Marat Dukhanf9480682020-07-31 14:50:24 -0700740 const int32_t izp = (int32_t) params->input_zero_point;
741 do {
742 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
743 const size_t nr_block_size = min(nc - nr_block_start, nr);
744 int32_t* packed_b = (int32_t*) packed_w;
745 if XNN_LIKELY(b != NULL) {
746 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
747 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
748 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
749 }
750 } else {
751 size_t n = nr_block_size;
752 do {
753 *((int32_t*) packed_w) = 0;
754 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
755 } while (--n != 0);
756 }
757 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
Frank Barchard952cb512021-10-28 11:39:07 -0700758
Marat Dukhanf9480682020-07-31 14:50:24 -0700759 for (size_t ki = 0; ki < ks; ki++) {
Frank Barchard952cb512021-10-28 11:39:07 -0700760 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
761 const size_t kr_block_size = min(kc - kr_block_start, kr);
762 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
763 int32_t ksum = 0;
764 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
765 const int8_t kv =
766 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset)];
767 ksum += (int32_t) kv;
768 *((int8_t*) packed_w) = kv;
769 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
770 }
771 packed_b[nr_block_offset] -= ksum * izp;
772 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
773 }
774 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
775 }
776
777 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
Marat Dukhanf9480682020-07-31 14:50:24 -0700778 const size_t kr_block_size = min(kc - kr_block_start, kr);
779 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
780 int32_t ksum = 0;
781 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
782 const int8_t kv =
783 k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
784 ksum += (int32_t) kv;
785 *((int8_t*) packed_w) = kv;
786 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
787 }
788 packed_b[nr_block_offset] -= ksum * izp;
789 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
790 }
791 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
792 }
Frank Barchard952cb512021-10-28 11:39:07 -0700793
Marat Dukhanf9480682020-07-31 14:50:24 -0700794 }
Marat Dukhane06c8132021-06-03 08:59:11 -0700795 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanf9480682020-07-31 14:50:24 -0700796 }
797 k += ks * kc * nc;
798 if XNN_UNPREDICTABLE(b != NULL) {
799 b += nc;
800 }
801 } while (--g != 0);
802}
803
Marat Dukhana6879bd2020-07-06 14:25:08 -0700804void xnn_pack_f32_conv_kgo_w(
805 size_t g,
806 size_t nc,
807 size_t ks,
808 size_t nr,
809 size_t kr,
810 const float* k,
811 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700812 float* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700813 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700814 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700815{
816 for (size_t i = 0; i < g; i++) {
817 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
818 const size_t nr_block_size = min(nc - nr_block_start, nr);
819 if XNN_LIKELY(b != NULL) {
820 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
821 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
822 }
823 }
824 packed_w += nr;
825 for (size_t ki = 0; ki < ks; ki++) {
826 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
827 *packed_w =
828 k[ki * g * nc + (nr_block_start + nr_block_offset)];
829 packed_w += kr;
830 }
831 packed_w += (nr - nr_block_size) * kr;
832 }
Marat Dukhan97262462021-06-18 16:14:17 -0700833 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700834 }
835 k += nc;
836 if XNN_UNPREDICTABLE(b != NULL) {
837 b += nc;
838 }
839 }
840}
841
842void xnn_pack_f16_conv_kgo_w(
843 size_t g,
844 size_t nc,
845 size_t ks,
846 size_t nr,
847 size_t kr,
848 const uint16_t* k,
849 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700850 uint16_t* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700851 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700852 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700853{
854 for (size_t i = 0; i < g; i++) {
855 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
856 const size_t nr_block_size = min(nc - nr_block_start, nr);
857 if XNN_LIKELY(b != NULL) {
858 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
859 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
860 }
861 }
862 packed_w += nr;
863 for (size_t ki = 0; ki < ks; ki++) {
864 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
865 *packed_w =
866 k[ki * g * nc + (nr_block_start + nr_block_offset)];
867 packed_w += kr;
868 }
869 packed_w += (nr - nr_block_size) * kr;
870 }
Marat Dukhan97262462021-06-18 16:14:17 -0700871 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -0700872 }
873 k += nc;
874 if XNN_UNPREDICTABLE(b != NULL) {
875 b += nc;
876 }
877 }
878}
879
Marat Dukhan08b7a972020-07-14 18:17:29 -0700880void xnn_pack_qu8_conv_kgo_w(
Marat Dukhanab582382020-07-06 13:32:08 -0700881 size_t g,
882 size_t nc,
883 size_t ks,
Marat Dukhana6879bd2020-07-06 14:25:08 -0700884 size_t nr,
885 size_t kr,
Marat Dukhanab582382020-07-06 13:32:08 -0700886 const uint8_t* k,
887 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700888 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700889 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700890 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -0700891{
Marat Dukhanb42f8662020-07-06 20:46:13 -0700892 const int32_t izp = (int32_t) params->input_zero_point;
893 const int32_t boff = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -0700894 for (size_t i = 0; i < g; i++) {
895 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
896 const size_t nr_block_size = min(nc - nr_block_start, nr);
897 int32_t* packed_b = (int32_t*) packed_w;
898 if XNN_LIKELY(b != NULL) {
899 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
900 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
901 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
902 }
903 } else {
904 size_t n = nr_block_size;
905 do {
906 *((int32_t*) packed_w) = boff;
907 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
908 } while (--n != 0);
909 }
910 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
911 for (size_t ki = 0; ki < ks; ki++) {
912 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
913 const uint8_t kv =
914 k[ki * g * nc + (nr_block_start + nr_block_offset)];
915 *((uint8_t*) packed_w) = kv;
Marat Dukhanb42f8662020-07-06 20:46:13 -0700916 packed_b[nr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -0700917 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
918 }
919 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
920 }
Marat Dukhan97262462021-06-18 16:14:17 -0700921 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -0700922 }
923 k += nc;
924 if XNN_UNPREDICTABLE(b != NULL) {
925 b += nc;
926 }
927 }
928}
929
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700930void xnn_pack_qs8_conv_kgo_w(
931 size_t g,
932 size_t nc,
933 size_t ks,
934 size_t nr,
935 size_t kr,
936 const int8_t* k,
937 const int32_t* b,
938 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -0700939 size_t extra_bytes,
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700940 const struct xnn_qs8_packing_params* params)
941{
942 const int32_t izp = (int32_t) params->input_zero_point;
943 for (size_t i = 0; i < g; i++) {
944 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
945 const size_t nr_block_size = min(nc - nr_block_start, nr);
946 int32_t* packed_b = (int32_t*) packed_w;
947 if XNN_LIKELY(b != NULL) {
948 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
949 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
950 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
951 }
952 } else {
953 size_t n = nr_block_size;
954 do {
955 *((int32_t*) packed_w) = 0;
956 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
957 } while (--n != 0);
958 }
959 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
960 for (size_t ki = 0; ki < ks; ki++) {
961 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
962 const int8_t kv =
963 k[ki * g * nc + (nr_block_start + nr_block_offset)];
964 *((int8_t*) packed_w) = kv;
965 packed_b[nr_block_offset] -= (int32_t) kv * izp;
966 packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(int8_t));
967 }
968 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
969 }
Marat Dukhan97262462021-06-18 16:14:17 -0700970 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700971 }
972 k += nc;
973 if XNN_UNPREDICTABLE(b != NULL) {
974 b += nc;
975 }
976 }
977}
978
Marat Dukhana6879bd2020-07-06 14:25:08 -0700979void xnn_pack_f32_deconv_goki_w(
980 size_t g,
981 size_t nc,
982 size_t kh,
983 size_t kw,
984 size_t kc,
985 size_t sh,
986 size_t sw,
987 size_t nr,
988 size_t kr,
989 size_t sr,
990 const float* k,
991 const float* b,
992 float* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -0700993 struct subconvolution_params* subconv_params,
994 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -0700995{
996 const size_t skr = sr * kr;
997 const size_t skc = round_down_po2(kc, skr);
998 const size_t sr_mask = (sr - 1) * kr;
999 for (size_t i = 0; i < g; i++) {
1000 for (size_t oy = 0; oy < sh; oy++) {
1001 for (size_t ox = 0; ox < sw; ox++) {
1002 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -07001003 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -07001004 }
1005 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1006 const size_t nr_block_size = min(nc - nr_block_start, nr);
1007 if XNN_LIKELY(b != NULL) {
1008 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1009 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
1010 }
1011 }
1012 packed_w += nr;
1013 for (size_t ky = oy; ky < kh; ky += sh) {
1014 for (size_t kx = ox; kx < kw; kx += sw) {
1015 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
1016 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1017 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1018 *packed_w++ =
1019 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
1020 }
1021 }
1022 packed_w += (nr - nr_block_size) * kr;
1023 }
1024
1025 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
1026 const size_t kr_block_size = min(kc - kr_block_start, kr);
1027 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1028 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1029 *packed_w++ =
1030 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
1031 }
1032 packed_w += kr - kr_block_size;
1033 }
1034 packed_w += (nr - nr_block_size) * kr;
1035 }
1036 }
1037 }
1038 }
1039 }
1040 }
1041 k += kh * kw * kc * nc;
1042 if XNN_UNPREDICTABLE(b != NULL) {
1043 b += nc;
1044 }
1045 }
1046}
1047
1048void xnn_pack_f16_deconv_goki_w(
1049 size_t g,
1050 size_t nc,
1051 size_t kh,
1052 size_t kw,
1053 size_t kc,
1054 size_t sh,
1055 size_t sw,
1056 size_t nr,
1057 size_t kr,
1058 size_t sr,
1059 const uint16_t* k,
1060 const uint16_t* b,
1061 uint16_t* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001062 struct subconvolution_params* subconv_params,
1063 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001064{
1065 const size_t skr = sr * kr;
1066 const size_t skc = round_down_po2(kc, skr);
1067 const size_t sr_mask = (sr - 1) * kr;
1068 for (size_t i = 0; i < g; i++) {
1069 for (size_t oy = 0; oy < sh; oy++) {
1070 for (size_t ox = 0; ox < sw; ox++) {
1071 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -07001072 (*subconv_params++).weights = packed_w;
Marat Dukhana6879bd2020-07-06 14:25:08 -07001073 }
1074 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1075 const size_t nr_block_size = min(nc - nr_block_start, nr);
1076 if XNN_LIKELY(b != NULL) {
1077 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1078 packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
1079 }
1080 }
1081 packed_w += nr;
1082 for (size_t ky = oy; ky < kh; ky += sh) {
1083 for (size_t kx = ox; kx < kw; kx += sw) {
Frank Barchard66ae2572021-11-02 17:36:21 -07001084
Marat Dukhana6879bd2020-07-06 14:25:08 -07001085 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
1086 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1087 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1088 *packed_w++ =
1089 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
1090 }
1091 }
1092 packed_w += (nr - nr_block_size) * kr;
1093 }
1094
1095 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
1096 const size_t kr_block_size = min(kc - kr_block_start, kr);
1097 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1098 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1099 *packed_w++ =
1100 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
1101 }
1102 packed_w += kr - kr_block_size;
1103 }
1104 packed_w += (nr - nr_block_size) * kr;
1105 }
1106 }
1107 }
1108 }
1109 }
1110 }
1111 k += kh * kw * kc * nc;
1112 if XNN_UNPREDICTABLE(b != NULL) {
1113 b += nc;
1114 }
1115 }
1116}
1117
Marat Dukhanbea849a2021-07-30 16:25:30 -07001118void xnn_pack_qs8_deconv_goki_w(
1119 size_t g,
1120 size_t nc,
1121 size_t kh,
1122 size_t kw,
1123 size_t kc,
1124 size_t sh,
1125 size_t sw,
1126 size_t nr,
1127 size_t kr,
1128 size_t sr,
1129 const int8_t* k,
1130 const int32_t* b,
1131 void* packed_w,
1132 struct subconvolution_params* subconv_params,
1133 const struct xnn_qs8_packing_params* params)
1134{
Frank Barchard66ae2572021-11-02 17:36:21 -07001135 const size_t skr = sr * kr;
1136 const size_t skc = round_down_po2(kc, skr);
1137 const size_t sr_mask = (sr - 1) * kr;
Marat Dukhanbea849a2021-07-30 16:25:30 -07001138 const int32_t izp = (int32_t) params->input_zero_point;
1139 for (size_t i = 0; i < g; i++) {
1140 for (size_t oy = 0; oy < sh; oy++) {
1141 for (size_t ox = 0; ox < sw; ox++) {
1142 if (i == 0) {
1143 (*subconv_params++).weights = packed_w;
1144 }
1145 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1146 const size_t nr_block_size = min(nc - nr_block_start, nr);
1147 int32_t* packed_b = (int32_t*) packed_w;
1148 if XNN_LIKELY(b != 0) {
1149 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1150 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset];
1151 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1152 }
1153 } else {
1154 size_t n = nr_block_size;
1155 do {
1156 *((int32_t*) packed_w) = 0;
1157 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1158 } while (--n != 0);
1159 }
1160 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1161 for (size_t ky = oy; ky < kh; ky += sh) {
1162 for (size_t kx = ox; kx < kw; kx += sw) {
Frank Barchard66ae2572021-11-02 17:36:21 -07001163
1164 for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
1165 const size_t kr_block_size = min(kc - kr_block_start, kr);
1166 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1167 int32_t ksum = 0;
1168 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1169 const int8_t kv =
1170 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset)];
1171 ksum += (int32_t) kv;
1172 *((int8_t*) packed_w) = kv;
1173 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1174 }
1175 packed_b[nr_block_offset] -= ksum * izp;
1176 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
1177 }
1178 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
1179 }
1180
1181 for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
Marat Dukhanbea849a2021-07-30 16:25:30 -07001182 const size_t kr_block_size = min(kc - kr_block_start, kr);
1183 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1184 int32_t ksum = 0;
1185 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1186 const int8_t kv =
1187 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
1188 ksum += (int32_t) kv;
1189 *((int8_t*) packed_w) = kv;
1190 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1191 }
1192 packed_b[nr_block_offset] -= ksum * izp;
1193 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t));
1194 }
1195 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t));
1196 }
Frank Barchard66ae2572021-11-02 17:36:21 -07001197
Marat Dukhanbea849a2021-07-30 16:25:30 -07001198 }
1199 }
1200 }
1201 }
1202 }
1203 k += kh * kw * kc * nc;
1204 if XNN_UNPREDICTABLE(b != NULL) {
1205 b += nc;
1206 }
1207 }
1208}
1209
Marat Dukhan08b7a972020-07-14 18:17:29 -07001210void xnn_pack_qu8_deconv_goki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001211 size_t g,
1212 size_t nc,
1213 size_t kh,
1214 size_t kw,
1215 size_t kc,
1216 size_t sh,
1217 size_t sw,
1218 size_t nr,
1219 size_t kr,
Marat Dukhan5a698bb2020-07-07 20:47:55 -07001220 size_t sr,
Marat Dukhanab582382020-07-06 13:32:08 -07001221 const uint8_t* k,
1222 const int32_t* b,
1223 void* packed_w,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001224 struct subconvolution_params* subconv_params,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001225 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001226{
Marat Dukhan5a698bb2020-07-07 20:47:55 -07001227 assert(sr == 1);
Marat Dukhanb42f8662020-07-06 20:46:13 -07001228 const int32_t izp = (int32_t) params->input_zero_point;
1229 const int32_t kzp = (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001230 for (size_t i = 0; i < g; i++) {
1231 for (size_t oy = 0; oy < sh; oy++) {
1232 for (size_t ox = 0; ox < sw; ox++) {
1233 if (i == 0) {
Marat Dukhanb42f8662020-07-06 20:46:13 -07001234 (*subconv_params++).weights = packed_w;
Marat Dukhanab582382020-07-06 13:32:08 -07001235 }
Marat Dukhanb42f8662020-07-06 20:46:13 -07001236 const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
Marat Dukhanab582382020-07-06 13:32:08 -07001237 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1238 const size_t nr_block_size = min(nc - nr_block_start, nr);
1239 int32_t* packed_b = (int32_t*) packed_w;
1240 if XNN_LIKELY(b != 0) {
1241 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1242 *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
1243 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1244 }
1245 } else {
1246 size_t n = nr_block_size;
1247 do {
1248 *((int32_t*) packed_w) = boff;
1249 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1250 } while (--n != 0);
1251 }
1252 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
1253 for (size_t ky = oy; ky < kh; ky += sh) {
1254 for (size_t kx = ox; kx < kw; kx += sw) {
1255 for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
1256 const size_t kr_block_size = min(kc - kr_block_start, kr);
1257 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1258 int32_t ksum = 0;
1259 for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
1260 const uint8_t kv =
1261 k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
1262 ksum += (int32_t) kv;
1263 *((uint8_t*) packed_w) = kv;
1264 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1265 }
Marat Dukhanb42f8662020-07-06 20:46:13 -07001266 packed_b[nr_block_offset] -= ksum * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001267 packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
1268 }
1269 packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
1270 }
1271 }
1272 }
1273 }
1274 }
1275 }
1276 k += kh * kw * kc * nc;
1277 if XNN_UNPREDICTABLE(b != NULL) {
1278 b += nc;
1279 }
1280 }
1281}
1282
Marat Dukhana6879bd2020-07-06 14:25:08 -07001283void xnn_pack_f32_dwconv_ghw_w(
1284 size_t h,
1285 size_t w,
1286 size_t c,
1287 size_t cr,
1288 const float* k,
1289 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001290 float* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001291 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001292 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001293{
1294 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1295 const size_t cr_block_size = min(c - cr_block_start, cr);
1296 if XNN_LIKELY(b != NULL) {
1297 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1298 *packed_w++ = b[cr_block_start + cr_block_offset];
1299 }
1300 } else {
1301 size_t n = cr_block_size;
1302 do {
1303 *packed_w++ = 0.0f;
1304 } while (--n != 0);
1305 }
1306 packed_w += cr - cr_block_size;
1307 for (size_t x = 0; x < w; x++) {
1308 for (size_t y = 0; y < h; y++) {
1309 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1310 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1311 *packed_w++ = kv;
1312 }
1313 packed_w += cr - cr_block_size;
1314 }
1315 }
Marat Dukhan82286892021-06-04 17:27:27 -07001316 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001317 }
1318}
1319
1320void xnn_pack_f16_dwconv_ghw_w(
1321 size_t h,
1322 size_t w,
1323 size_t c,
1324 size_t cr,
1325 const uint16_t* k,
1326 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001327 uint16_t* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001328 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001329 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001330{
1331 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1332 const size_t cr_block_size = min(c - cr_block_start, cr);
1333 if XNN_LIKELY(b != NULL) {
1334 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1335 *packed_w++ = b[cr_block_start + cr_block_offset];
1336 }
1337 } else {
1338 size_t n = cr_block_size;
1339 do {
1340 *packed_w++ = 0;
1341 } while (--n != 0);
1342 }
1343 packed_w += cr - cr_block_size;
1344 for (size_t x = 0; x < w; x++) {
1345 for (size_t y = 0; y < h; y++) {
1346 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1347 const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1348 *packed_w++ = kv;
1349 }
1350 packed_w += cr - cr_block_size;
1351 }
1352 }
Marat Dukhan82286892021-06-04 17:27:27 -07001353 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001354 }
1355}
1356
Marat Dukhan08b7a972020-07-14 18:17:29 -07001357void xnn_pack_qu8_dwconv_ghw_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001358 size_t h,
1359 size_t w,
1360 size_t c,
1361 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001362 const uint8_t* k,
1363 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001364 void* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001365 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001366 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001367{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001368 const int32_t izp = (int32_t) params->input_zero_point;
1369 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001370 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1371 const size_t cr_block_size = min(c - cr_block_start, cr);
1372 int32_t* packed_b = (int32_t*) packed_w;
1373 if XNN_LIKELY(b != NULL) {
1374 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1375 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1376 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1377 }
1378 } else {
1379 size_t n = cr_block_size;
1380 do {
1381 *((int32_t*) packed_w) = boff;
1382 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1383 } while (--n != 0);
1384 }
1385 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1386 for (size_t x = 0; x < w; x++) {
1387 for (size_t y = 0; y < h; y++) {
1388 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1389 const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001390 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001391 *((uint8_t*) packed_w) = kv;
1392 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1393 }
1394 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1395 }
1396 }
Marat Dukhan82286892021-06-04 17:27:27 -07001397 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -07001398 }
1399}
1400
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001401void xnn_pack_qs8_dwconv_ghw_w(
1402 size_t h,
1403 size_t w,
1404 size_t c,
1405 size_t cr,
1406 const int8_t* k,
1407 const int32_t* b,
1408 void* packed_w,
Marat Dukhan82286892021-06-04 17:27:27 -07001409 size_t extra_bytes,
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001410 const struct xnn_qs8_packing_params* params)
1411{
1412 const int32_t izp = (int32_t) params->input_zero_point;
1413 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1414 const size_t cr_block_size = min(c - cr_block_start, cr);
1415 int32_t* packed_b = (int32_t*) packed_w;
1416 if XNN_LIKELY(b != NULL) {
1417 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1418 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1419 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1420 }
1421 } else {
1422 size_t n = cr_block_size;
1423 do {
1424 *((int32_t*) packed_w) = 0;
1425 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1426 } while (--n != 0);
1427 }
1428 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1429 for (size_t x = 0; x < w; x++) {
1430 for (size_t y = 0; y < h; y++) {
1431 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1432 const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1433 packed_b[cr_block_offset] -= (int32_t) kv * izp;
1434 *((int8_t*) packed_w) = kv;
1435 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1436 }
1437 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1438 }
1439 }
Marat Dukhan82286892021-06-04 17:27:27 -07001440 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanf62bbdc2020-08-04 13:59:04 -07001441 }
1442}
1443
Marat Dukhana6879bd2020-07-06 14:25:08 -07001444void xnn_pack_f32_dwconv_hwg_w(
1445 size_t h,
1446 size_t w,
1447 size_t c,
1448 size_t cr,
1449 const float* k,
1450 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001451 float* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001452 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001453 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001454{
1455 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1456 const size_t cr_block_size = min(c - cr_block_start, cr);
1457 if XNN_LIKELY(b != NULL) {
1458 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1459 *packed_w++ = b[cr_block_start + cr_block_offset];
1460 }
1461 } else {
1462 size_t n = cr_block_size;
1463 do {
1464 *packed_w++ = 0.0f;
1465 } while (--n != 0);
1466 }
1467 packed_w += cr - cr_block_size;
1468 for (size_t x = 0; x < w; x++) {
1469 for (size_t y = 0; y < h; y++) {
1470 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1471 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1472 *packed_w++ = kv;
1473 }
1474 packed_w += cr - cr_block_size;
1475 }
1476 }
Marat Dukhan97262462021-06-18 16:14:17 -07001477 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001478 }
1479}
1480
1481void xnn_pack_f16_dwconv_hwg_w(
1482 size_t h,
1483 size_t w,
1484 size_t c,
1485 size_t cr,
1486 const uint16_t* k,
1487 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001488 uint16_t* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001489 size_t extra_bytes,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001490 const void* params)
Marat Dukhana6879bd2020-07-06 14:25:08 -07001491{
1492 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1493 const size_t cr_block_size = min(c - cr_block_start, cr);
1494 if XNN_LIKELY(b != NULL) {
1495 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1496 *packed_w++ = b[cr_block_start + cr_block_offset];
1497 }
1498 } else {
1499 size_t n = cr_block_size;
1500 do {
1501 *packed_w++ = 0;
1502 } while (--n != 0);
1503 }
1504 packed_w += cr - cr_block_size;
1505 for (size_t x = 0; x < w; x++) {
1506 for (size_t y = 0; y < h; y++) {
1507 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1508 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1509 *packed_w++ = kv;
1510 }
1511 packed_w += cr - cr_block_size;
1512 }
1513 }
Marat Dukhan97262462021-06-18 16:14:17 -07001514 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhana6879bd2020-07-06 14:25:08 -07001515 }
1516}
1517
Marat Dukhan08b7a972020-07-14 18:17:29 -07001518void xnn_pack_qu8_dwconv_hwg_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001519 size_t h,
1520 size_t w,
1521 size_t c,
1522 size_t cr,
Marat Dukhanab582382020-07-06 13:32:08 -07001523 const uint8_t* k,
1524 const int32_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001525 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001526 size_t extra_bytes,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001527 const struct xnn_qu8_packing_params* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001528{
Marat Dukhanb42f8662020-07-06 20:46:13 -07001529 const int32_t izp = (int32_t) params->input_zero_point;
1530 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
Marat Dukhanab582382020-07-06 13:32:08 -07001531 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1532 const size_t cr_block_size = min(c - cr_block_start, cr);
1533 int32_t* packed_b = (int32_t*) packed_w;
1534 if XNN_LIKELY(b != NULL) {
1535 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1536 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
1537 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1538 }
1539 } else {
1540 size_t n = cr_block_size;
1541 do {
1542 *((int32_t*) packed_w) = boff;
1543 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1544 } while (--n != 0);
1545 }
1546 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1547 for (size_t x = 0; x < w; x++) {
1548 for (size_t y = 0; y < h; y++) {
1549 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1550 const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
Marat Dukhanb42f8662020-07-06 20:46:13 -07001551 packed_b[cr_block_offset] -= (int32_t) kv * izp;
Marat Dukhanab582382020-07-06 13:32:08 -07001552 *((uint8_t*) packed_w) = kv;
1553 packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
1554 }
1555 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
1556 }
1557 }
Marat Dukhan97262462021-06-18 16:14:17 -07001558 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhanab582382020-07-06 13:32:08 -07001559 }
1560}
1561
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001562void xnn_pack_qs8_dwconv_hwg_w(
1563 size_t h,
1564 size_t w,
1565 size_t c,
1566 size_t cr,
1567 const int8_t* k,
1568 const int32_t* b,
1569 void* packed_w,
Marat Dukhan97262462021-06-18 16:14:17 -07001570 size_t extra_bytes,
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001571 const struct xnn_qs8_packing_params* params)
1572{
1573 const int32_t izp = (int32_t) params->input_zero_point;
1574 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1575 const size_t cr_block_size = min(c - cr_block_start, cr);
1576 int32_t* packed_b = (int32_t*) packed_w;
1577 if XNN_LIKELY(b != NULL) {
1578 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1579 *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset];
1580 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1581 }
1582 } else {
1583 size_t n = cr_block_size;
1584 do {
1585 *((int32_t*) packed_w) = 0;
1586 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
1587 } while (--n != 0);
1588 }
1589 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
1590 for (size_t x = 0; x < w; x++) {
1591 for (size_t y = 0; y < h; y++) {
1592 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1593 const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1594 packed_b[cr_block_offset] -= (int32_t) kv * izp;
1595 *((int8_t*) packed_w) = kv;
1596 packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
1597 }
1598 packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
1599 }
1600 }
Marat Dukhan97262462021-06-18 16:14:17 -07001601 packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001602 }
1603}
1604
// Packs f32 GEMM weights without a bias row. For each of the `g` groups the
// source is indexed as k[n * kc + kk] with n in [0, nc) and kk in [0, kc).
//
// Output layout, per block of `nr` rows:
//   - for the leading part of K (kc rounded down with round_down_po2 to
//     sr * kr), chunks of `kr` elements are emitted row by row; the chunk read
//     for row n is displaced inside its sr * kr span by
//     ((kr_start + n * kr) & ((sr - 1) * kr));
//   - for the remaining K, chunks are copied without displacement and a short
//     final chunk leaves its unused slots unwritten;
//   - rows missing from a partial block leave their slots unwritten.
//
// The group loop tests `g` only after the first pass, so callers are expected
// to pass g >= 1. `params` is not read by this function.
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t shuffle_span = sr * kr;
  const size_t shuffled_kc = round_down_po2(kc, shuffle_span);
  const size_t shuffle_mask = (sr - 1) * kr;

  do {
    for (size_t row_start = 0; row_start < nc; row_start += nr) {
      const size_t row_count = min(nc - row_start, nr);
      const size_t missing_row_slots = (nr - row_count) * kr;

      // Part of K covered by whole sr * kr spans: displaced chunks.
      for (size_t k_start = 0; k_start < shuffled_kc; k_start += kr) {
        const size_t span_base = round_down_po2(k_start, shuffle_span);
        for (size_t row = 0; row < row_count; row++) {
          const size_t src =
            (row_start + row) * kc + span_base + ((k_start + row * kr) & shuffle_mask);
          for (size_t i = 0; i < kr; i++) {
            packed_w[i] = k[src + i];
          }
          packed_w += kr;
        }
        packed_w += missing_row_slots;
      }

      // Remainder of K: straight copy, possibly with a short last chunk.
      for (size_t k_start = shuffled_kc; k_start < kc; k_start += kr) {
        const size_t chunk = min(kc - k_start, kr);
        for (size_t row = 0; row < row_count; row++) {
          const size_t src = (row_start + row) * kc + k_start;
          for (size_t i = 0; i < chunk; i++) {
            packed_w[i] = k[src + i];
          }
          packed_w += kr;
        }
        packed_w += missing_row_slots;
      }
    }
    k += nc * kc;
    g -= 1;
  } while (g != 0);
}
1648
// Packs f16 GEMM weights (elements carried as raw uint16_t) without a bias
// row. For each of the `g` groups the source is indexed as k[n * kc + kk]
// with n in [0, nc) and kk in [0, kc).
//
// Output layout, per block of `nr` rows:
//   - for the leading part of K (kc rounded down with round_down_po2 to
//     sr * kr), chunks of `kr` elements are emitted row by row; the chunk read
//     for row n is displaced inside its sr * kr span by
//     ((kr_start + n * kr) & ((sr - 1) * kr));
//   - for the remaining K, chunks are copied without displacement and a short
//     final chunk leaves its unused slots unwritten;
//   - rows missing from a partial block leave their slots unwritten.
//
// The group loop tests `g` only after the first pass, so callers are expected
// to pass g >= 1. `params` is not read by this function.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t shuffle_span = sr * kr;
  const size_t shuffled_kc = round_down_po2(kc, shuffle_span);
  const size_t shuffle_mask = (sr - 1) * kr;

  do {
    for (size_t row_start = 0; row_start < nc; row_start += nr) {
      const size_t row_count = min(nc - row_start, nr);
      const size_t missing_row_slots = (nr - row_count) * kr;

      // Part of K covered by whole sr * kr spans: displaced chunks.
      for (size_t k_start = 0; k_start < shuffled_kc; k_start += kr) {
        const size_t span_base = round_down_po2(k_start, shuffle_span);
        for (size_t row = 0; row < row_count; row++) {
          const size_t src =
            (row_start + row) * kc + span_base + ((k_start + row * kr) & shuffle_mask);
          for (size_t i = 0; i < kr; i++) {
            packed_w[i] = k[src + i];
          }
          packed_w += kr;
        }
        packed_w += missing_row_slots;
      }

      // Remainder of K: straight copy, possibly with a short last chunk.
      for (size_t k_start = shuffled_kc; k_start < kc; k_start += kr) {
        const size_t chunk = min(kc - k_start, kr);
        for (size_t row = 0; row < row_count; row++) {
          const size_t src = (row_start + row) * kc + k_start;
          for (size_t i = 0; i < chunk; i++) {
            packed_w[i] = k[src + i];
          }
          packed_w += kr;
        }
        packed_w += missing_row_slots;
      }
    }
    k += nc * kc;
    g -= 1;
  } while (g != 0);
}
1692
Marat Dukhana6879bd2020-07-06 14:25:08 -07001693void xnn_pack_f32_dconv_oki_w(
Marat Dukhanab582382020-07-06 13:32:08 -07001694 size_t nc,
Marat Dukhanab582382020-07-06 13:32:08 -07001695 size_t kc,
1696 size_t nr,
Marat Dukhana6879bd2020-07-06 14:25:08 -07001697 size_t kh,
1698 size_t kw,
1699 const float* k,
1700 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001701 float* packed_w,
1702 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001703{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001704 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1705 const size_t nr_block_size = min(nc - nr_block_start, nr);
1706 if XNN_LIKELY(b != NULL) {
1707 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1708 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
Marat Dukhanab582382020-07-06 13:32:08 -07001709 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001710 } else {
1711 size_t n = nr;
1712 do {
1713 *packed_w++ = 0.0f;
1714 } while (--n != 0);
1715 }
Marat Dukhanab582382020-07-06 13:32:08 -07001716
Marat Dukhana6879bd2020-07-06 14:25:08 -07001717 for (size_t kx = 0; kx < kw; kx++) {
1718 for (size_t c = 0; c < kc; c++) {
1719 for (size_t ky = 0; ky < kh; ky++) {
1720 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1721 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
Marat Dukhanab582382020-07-06 13:32:08 -07001722 }
Marat Dukhanab582382020-07-06 13:32:08 -07001723 }
1724 }
1725 }
Marat Dukhanab582382020-07-06 13:32:08 -07001726 if XNN_UNPREDICTABLE(b != NULL) {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001727 b += nr;
Marat Dukhanab582382020-07-06 13:32:08 -07001728 }
1729 }
1730}
1731
1732void xnn_pack_f16_dconv_oki_w(
1733 size_t nc,
1734 size_t kc,
1735 size_t nr,
1736 size_t kh,
1737 size_t kw,
1738 const uint16_t* k,
1739 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001740 uint16_t* packed_w,
1741 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001742{
1743 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1744 const size_t nr_block_size = min(nc - nr_block_start, nr);
1745 if XNN_LIKELY(b != NULL) {
1746 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1747 *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
1748 }
1749 } else {
1750 size_t n = nr;
1751 do {
1752 *packed_w++ = 0;
1753 } while (--n != 0);
1754 }
1755
1756 for (size_t kx = 0; kx < kw; kx++) {
1757 for (size_t c = 0; c < kc; c++) {
1758 for (size_t ky = 0; ky < kh; ky++) {
1759 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
1760 *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
1761 }
1762 }
1763 }
1764 }
1765 if XNN_UNPREDICTABLE(b != NULL) {
1766 b += nr;
1767 }
1768 }
1769}
1770
Marat Dukhana6879bd2020-07-06 14:25:08 -07001771void xnn_pack_f32_chw_dwconv_ghw_w(
1772 size_t kernel_size,
1773 size_t groups,
1774 const float* kernel,
1775 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001776 float* packed_weights,
1777 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001778{
Marat Dukhana6879bd2020-07-06 14:25:08 -07001779 for (size_t g = 0; g < groups; g++) {
1780 if XNN_LIKELY(bias != NULL) {
1781 *packed_weights = *bias++;
Marat Dukhanab582382020-07-06 13:32:08 -07001782 } else {
Marat Dukhana6879bd2020-07-06 14:25:08 -07001783 *packed_weights = 0.0f;
Marat Dukhanab582382020-07-06 13:32:08 -07001784 }
Marat Dukhana6879bd2020-07-06 14:25:08 -07001785 packed_weights += 1;
1786 for (size_t i = 0; i < kernel_size; i++) {
1787 *packed_weights++ = kernel[g * kernel_size + i];
Marat Dukhanab582382020-07-06 13:32:08 -07001788 }
1789 }
1790}
1791
1792void xnn_pack_f16_chw_dwconv_ghw_w(
1793 size_t kernel_size,
1794 size_t groups,
1795 const uint16_t* kernel,
1796 const uint16_t* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001797 uint16_t* packed_weights,
1798 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001799{
1800 for (size_t g = 0; g < groups; g++) {
1801 if XNN_LIKELY(bias != NULL) {
1802 *packed_weights = *bias++;
1803 } else {
1804 *packed_weights = 0;
1805 }
1806 packed_weights += 1;
1807 for (size_t i = 0; i < kernel_size; i++) {
1808 *packed_weights++ = kernel[g * kernel_size + i];
1809 }
1810 }
1811}
1812
Marat Dukhanab582382020-07-06 13:32:08 -07001813void xnn_pack_f32_chw_dwconv_hwg_w(
1814 size_t kernel_size,
1815 size_t groups,
1816 const float* kernel,
1817 const float* bias,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001818 float* packed_weights,
1819 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001820{
1821 for (size_t g = 0; g < groups; g++) {
1822 if XNN_LIKELY(bias != NULL) {
1823 *packed_weights = *bias++;
1824 } else {
1825 *packed_weights = 0.0f;
1826 }
1827 packed_weights += 1;
1828 for (size_t i = 0; i < kernel_size; i++) {
1829 *packed_weights++ = kernel[i * groups + g];
1830 }
1831 }
1832}
1833
1834void xnn_pack_f32_vmulcaddc_w(
1835 size_t c,
1836 size_t cr,
1837 const float* s,
1838 const float* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001839 float* packed_w,
1840 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001841{
1842 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1843 const size_t cr_block_size = min(c - cr_block_start, cr);
1844 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1845 *packed_w++ = s[cr_block_start + cr_block_offset];
1846 }
1847 packed_w += cr - cr_block_size;
1848 if XNN_LIKELY(b != NULL) {
1849 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1850 *packed_w++ = b[cr_block_start + cr_block_offset];
1851 }
1852 } else {
1853 size_t n = cr_block_size;
1854 do {
1855 *packed_w++ = 0.0f;
1856 } while (--n != 0);
1857 }
1858 packed_w += cr - cr_block_size;
1859 }
1860}
1861
1862void xnn_pack_f16_vmulcaddc_w(
1863 size_t c,
1864 size_t cr,
1865 const uint16_t* s,
1866 const uint16_t* b,
Marat Dukhanb42f8662020-07-06 20:46:13 -07001867 uint16_t* packed_w,
1868 const void* params)
Marat Dukhanab582382020-07-06 13:32:08 -07001869{
1870 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1871 const size_t cr_block_size = min(c - cr_block_start, cr);
1872 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1873 *packed_w++ = s[cr_block_start + cr_block_offset];
1874 }
1875 packed_w += cr - cr_block_size;
1876 if XNN_LIKELY(b != NULL) {
1877 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1878 *packed_w++ = b[cr_block_start + cr_block_offset];
1879 }
1880 } else {
1881 size_t n = cr_block_size;
1882 do {
1883 *packed_w++ = 0;
1884 } while (--n != 0);
1885 }
1886 packed_w += cr - cr_block_size;
1887 }
1888}