// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <stddef.h>
#include <stdint.h>

#include <xnnpack/math.h>
#include <xnnpack/operator.h>

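// Packs 8-bit quantized GEMM weights stored in GOI order (groups, output
// channels, input channels) into the blocked layout used by the Q8 GEMM
// micro-kernels: per group, each panel of NR output channels holds NR int32
// bias values followed by the kernel bytes in KR-wide slices of the KC input
// channels. The packed bias is pre-biased with kc * izp * kzp and reduced by
// izp * sum(kernel row), i.e. the input/kernel zero-point corrections are
// folded into the accumulator initialization.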
static inline void xnn_pack_q8_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  uint32_t nr,
  uint32_t kr,
  uint8_t izp,
  uint8_t kzp,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w)
{
  const int32_t boff = (int32_t) kc * (int32_t) izp * (int32_t) kzp;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          int32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
            ksum += (int32_t) kv;
            *((uint8_t*) packed_w) = kv;
            packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
          }
          packed_b[nr_block_offset] -= ksum * (int32_t) izp;
          packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
      }
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

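// Same packing as xnn_pack_q8_gemm_goi_w, but for a single group of weights
// stored in IO order, i.e. the kernel is indexed as
// k[input_channel * nc + output_channel].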
static inline void xnn_pack_q8_gemm_io_w(
  size_t nc,
  size_t kc,
  uint32_t nr,
  uint32_t kr,
  uint8_t izp,
  uint8_t kzp,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w)
{
  const int32_t boff = (int32_t) kc * (int32_t) izp * (int32_t) kzp;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = nr_block_size;
      do {
        *((int32_t*) packed_w) = boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
    for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
      const size_t kr_block_size = min(kc - kr_block_start, kr);
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        int32_t ksum = 0;
        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
          const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
          ksum += (int32_t) kv;
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_b[nr_block_offset] -= ksum * (int32_t) izp;
        packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
    }
  }
}

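// Packs 8-bit quantized convolution weights stored in GOKI order (groups,
// output channels, kernel spatial positions, input channels). Each NR panel
// stores the zero-point-adjusted biases followed by, for each of the ks
// spatial positions, KR-wide slices of the kc input channels.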
static inline void xnn_pack_q8_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  uint32_t nr,
  uint32_t kr,
  uint8_t izp,
  uint8_t kzp,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w)
{
  const int32_t boff = (int32_t) ks * (int32_t) kc * (int32_t) izp * (int32_t) kzp;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
          const size_t kr_block_size = min(kc - kr_block_start, kr);
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            int32_t ksum = 0;
            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
              const uint8_t kv =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
              ksum += (int32_t) kv;
              *((uint8_t*) packed_w) = kv;
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
            }
            packed_b[nr_block_offset] -= ksum * (int32_t) izp;
            packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
          }
          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
        }
      }
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

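// Packs 8-bit quantized convolution weights stored in KGO order (kernel
// spatial positions, groups, output channels): a single kernel byte per
// output channel and spatial position, placed in the first byte of a KR-wide
// slot, with the same bias adjustment as the other Q8 packing routines.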
static inline void xnn_pack_q8_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  uint32_t nr,
  uint32_t kr,
  uint8_t izp,
  uint8_t kzp,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w)
{
  const int32_t boff = (int32_t) ks * (int32_t) izp * (int32_t) kzp;
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          *((int32_t*) packed_w) = boff;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          const uint8_t kv =
            k[ki * g * nc + (nr_block_start + nr_block_offset)];
          *((uint8_t*) packed_w) = kv;
          packed_b[nr_block_offset] -= (int32_t) kv * (int32_t) izp;
          packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
      }
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

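// Packs 8-bit quantized deconvolution (transposed convolution) weights in
// GOKI order using the subconvolution decomposition: the kernel is split into
// sh x sw stride phases (oy, ox), each phase is packed like an independent
// convolution over the taps it covers, and the starting address of every
// phase's packed weights is recorded in the corresponding
// subconvolution_params entry on the first group.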
static inline void xnn_pack_q8_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  uint8_t izp,
  uint8_t kzp,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  struct subconvolution_params* params)
{
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*params++).weights = packed_w;
        }
        const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * (int32_t) izp * (int32_t) kzp;
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          int32_t* packed_b = (int32_t*) packed_w;
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff;
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
            }
          } else {
            size_t n = nr_block_size;
            do {
              *((int32_t*) packed_w) = boff;
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
            } while (--n != 0);
          }
          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
                const size_t kr_block_size = min(kc - kr_block_start, kr);
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  int32_t ksum = 0;
                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
                    const uint8_t kv =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
                    ksum += (int32_t) kv;
                    *((uint8_t*) packed_w) = kv;
                    packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
                  }
                  packed_b[nr_block_offset] -= ksum * (int32_t) izp;
                  packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t));
                }
                packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t));
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

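// Packs 8-bit quantized depthwise-convolution weights. The GHW variant reads
// kernels stored as (groups, kernel height, kernel width); the HWG variant
// below reads (kernel height, kernel width, groups). Each block of CR
// channels stores zero-point-adjusted biases followed by the kernel taps,
// iterated with x (width) as the outer and y (height) as the inner loop.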
static inline void xnn_pack_q8_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  uint8_t izp,
  uint8_t kzp,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w)
{
  const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
      }
    }
  }
}

static inline void xnn_pack_q8_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  uint8_t izp,
  uint8_t kzp,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w)
{
  const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        *((int32_t*) packed_w) = boff;
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
      }
    }
  }
}

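// Packs half-precision (IEEE FP16, bit-cast to uint16_t) GEMM weights in GOI
// order. The panel layout matches the f32 packing below: NR bias values
// followed by KR-wide slices of the KC input channels. For the first
// round_down_po2(kc, sr * kr) input channels, the KR-slices within each
// sr * kr block are rotated per panel row (the sr "shuffle"); the remaining
// input channels are packed sequentially.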
static inline void xnn_pack_f16_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

static inline void xnn_pack_f16_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
      }
    }
    packed_w += nr;

    for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          *packed_w++ =
            k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
        }
      }
      packed_w += (nr - nr_block_size) * kr;
    }

    for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
      const size_t kr_block_size = min(kc - kr_block_start, kr);
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
          *packed_w++ =
            k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
        }
        packed_w += kr - kr_block_size;
      }
      packed_w += (nr - nr_block_size) * kr;
    }
  }
}

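// Same layout as xnn_pack_f16_gemm_goi_w, but without bias entries; intended
// for the GEMMINC micro-kernels, which take their initial accumulators from a
// separate buffer.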
static inline void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}

static inline void xnn_pack_f16_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              *packed_w++ =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
            }
          }
          packed_w += (nr - nr_block_size) * kr;
        }

        for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
          const size_t kr_block_size = min(kc - kr_block_start, kr);
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
              *packed_w++ =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
            }
            packed_w += kr - kr_block_size;
          }
          packed_w += (nr - nr_block_size) * kr;
        }
      }
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

static inline void xnn_pack_f16_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w)
{
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *packed_w =
            k[ki * g * nc + (nr_block_start + nr_block_offset)];
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

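// Packs weights for the DConv micro-kernels from OKI order (output channels,
// kernel height, kernel width, input channels). Each block of NR output
// channels stores NR bias values (replicating the last valid channel when nc
// is not a multiple of NR, or zeros when b is NULL), followed by kernel
// values ordered by kernel x, input channel, kernel y, and output channel.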
static inline void xnn_pack_f16_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w)
{
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
      }
    } else {
      size_t n = nr;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }

    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t c = 0; c < kc; c++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
          }
        }
      }
    }
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nr;
    }
  }
}

static inline void xnn_pack_f16_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  struct subconvolution_params* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
            }
          }
          packed_w += nr;
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    *packed_w++ =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
                  }
                }
                packed_w += (nr - nr_block_size) * kr;
              }

              for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
                const size_t kr_block_size = min(kc - kr_block_start, kr);
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
                    *packed_w++ =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
                  }
                  packed_w += kr - kr_block_size;
                }
                packed_w += (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

static inline void xnn_pack_f16_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
  }
}

static inline void xnn_pack_f16_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
  }
}

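// Packs depthwise-convolution weights for the CHW-layout micro-kernels: for
// each group, a single bias value (or zero) followed by that group's kernel
// taps in their original order.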
static inline void xnn_pack_f16_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const uint16_t* kernel,
  const uint16_t* bias,
  uint16_t* packed_weights)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[g * kernel_size + i];
    }
  }
}

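// Packs single-precision GEMM weights in GOI order; the panel layout is the
// same as in the f16 variant above. Per group, the packing writes
// round_up(nc, nr) * (1 + round_up(kc, kr)) floats: one bias slot plus
// round_up(kc, kr) kernel slots per (padded) output channel.
//
// A minimal usage sketch (illustrative only; the nr/kr/sr values and the
// identifiers `weights`, `bias`, and `packed` are assumptions, not part of
// this header):
//
//   const size_t nr = 8, kr = 1, sr = 1;
//   float* packed = calloc(round_up(nc, nr) * (1 + round_up(kc, kr)), sizeof(float));
//   xnn_pack_f32_gemm_goi_w(/*g=*/1, nc, kc, nr, kr, sr, weights, bias, packed);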
static inline void xnn_pack_f32_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

static inline void xnn_pack_f32_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
      }
    }
    packed_w += nr;

    for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          *packed_w++ =
            k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
        }
      }
      packed_w += (nr - nr_block_size) * kr;
    }

    for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
      const size_t kr_block_size = min(kc - kr_block_start, kr);
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
          *packed_w++ =
            k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
        }
        packed_w += kr - kr_block_size;
      }
      packed_w += (nr - nr_block_size) * kr;
    }
  }
}

static inline void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
          }
        }
        packed_w += (nr - nr_block_size) * kr;
      }

      for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
        const size_t kr_block_size = min(kc - kr_block_start, kr);
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
            *packed_w++ =
              k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
          }
          packed_w += kr - kr_block_size;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}

static inline void xnn_pack_f32_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              *packed_w++ =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
            }
          }
          packed_w += (nr - nr_block_size) * kr;
        }

        for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
          const size_t kr_block_size = min(kc - kr_block_start, kr);
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
              *packed_w++ =
                k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
            }
            packed_w += kr - kr_block_size;
          }
          packed_w += (nr - nr_block_size) * kr;
        }
      }
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

static inline void xnn_pack_f32_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  const float* k,
  const float* b,
  float* packed_w)
{
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;
      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          *packed_w =
            k[ki * g * nc + (nr_block_start + nr_block_offset)];
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

static inline void xnn_pack_f32_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const float* k,
  const float* b,
  float* packed_w)
{
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
      }
    } else {
      size_t n = nr;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }

    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t c = 0; c < kc; c++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
          }
        }
      }
    }
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nr;
    }
  }
}

static inline void xnn_pack_f32_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  struct subconvolution_params* params)
{
  const size_t skr = sr * kr;
  const size_t skc = round_down_po2(kc, skr);
  const size_t sr_mask = (sr - 1) * kr;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
            }
          }
          packed_w += nr;
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    *packed_w++ =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
                  }
                }
                packed_w += (nr - nr_block_size) * kr;
              }

              for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
                const size_t kr_block_size = min(kc - kr_block_start, kr);
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
                    *packed_w++ =
                      k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
                  }
                  packed_w += kr - kr_block_size;
                }
                packed_w += (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

static inline void xnn_pack_f32_dwconv_ghw_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  float* packed_w)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
  }
}

static inline void xnn_pack_f32_dwconv_hwg_w(
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  float* packed_w)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
  }
}

static inline void xnn_pack_f32_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0.0f;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[g * kernel_size + i];
    }
  }
}

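// Packs per-channel scale (s) and bias (b) vectors for the VMulCAddC
// (channelwise multiply-add) micro-kernels: each block of CR channels stores
// the scale values followed by the bias values (zeros when b is NULL), each
// sub-block occupying CR slots.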
static inline void xnn_pack_f32_vmulcaddc_w(
  size_t c,
  size_t cr,
  const float* s,
  const float* b,
  float* packed_w)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
      *packed_w++ = s[cr_block_start + cr_block_offset];
    }
    packed_w += cr - cr_block_size;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
  }
}