| // Copyright (c) Facebook, Inc. and its affiliates. |
| // All rights reserved. |
| // |
| // Copyright 2019 Google LLC |
| // |
| // This source code is licensed under the BSD-style license found in the |
| // LICENSE file in the root directory of this source tree. |
| |
| #include <stdint.h> |
| #include <stddef.h> |
| |
| #include <xnnpack/math.h> |
| #include <xnnpack/pack.h> |
| |
| |
| void xnn_pack_f32_gemm_goi_w( |
| size_t g, |
| size_t nc, |
| size_t kc, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const float* k, |
| const float* b, |
| float* packed_w, |
| size_t extra_bytes, |
| const void* params) |
| { |
| const size_t skr = sr * kr; |
| const size_t skc = round_down_po2(kc, skr); |
| const size_t sr_mask = (sr - 1) * kr; |
| do { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; |
| } |
| } |
| packed_w += nr; |
| |
| for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) { |
| *packed_w++ = |
| k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset]; |
| } |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| |
| for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| *packed_w++ = |
| k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)]; |
| } |
| packed_w += kr - kr_block_size; |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| packed_w = (float*) ((uintptr_t) packed_w + extra_bytes); |
| } |
| k += nc * kc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } while (--g != 0); |
| } |
| |
| void xnn_pack_f16_gemm_goi_w( |
| size_t g, |
| size_t nc, |
| size_t kc, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const uint16_t* k, |
| const uint16_t* b, |
| uint16_t* packed_w, |
| size_t extra_bytes, |
| const void* params) |
| { |
| const size_t skr = sr * kr; |
| const size_t skc = round_down_po2(kc, skr); |
| const size_t sr_mask = (sr - 1) * kr; |
| do { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; |
| } |
| } |
| packed_w += nr; |
| |
| for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) { |
| *packed_w++ = |
| k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset]; |
| } |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| |
| for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| *packed_w++ = |
| k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)]; |
| } |
| packed_w += kr - kr_block_size; |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes); |
| } |
| k += nc * kc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } while (--g != 0); |
| } |
| |
| void xnn_pack_qu8_gemm_goi_w( |
| size_t g, |
| size_t nc, |
| size_t kc, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const uint8_t* k, |
| const int32_t* b, |
| void* packed_w, |
| size_t extra_bytes, |
| const struct xnn_qu8_packing_params* params) |
| { |
| assert(sr == 1); |
| const int32_t izp = (int32_t) params->input_zero_point; |
| const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point; |
| do { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| int32_t* packed_b = (int32_t*) packed_w; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } |
| } else { |
| size_t n = nr_block_size; |
| do { |
| *((int32_t*) packed_w) = boff; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } while (--n != 0); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t)); |
| for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| int32_t ksum = 0; |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)]; |
| ksum += (int32_t) kv; |
| *((uint8_t*) packed_w) = kv; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t)); |
| } |
| packed_b[nr_block_offset] -= ksum * izp; |
| packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + extra_bytes); |
| } |
| k += nc * kc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } while (--g != 0); |
| } |
| |
| void xnn_pack_qs8_gemm_goi_w( |
| size_t g, |
| size_t nc, |
| size_t kc, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const int8_t* k, |
| const int32_t* b, |
| void* packed_w, |
| size_t extra_bytes, |
| const struct xnn_qs8_packing_params* params) |
| { |
| assert(sr == 1); |
| const int32_t izp = (int32_t) params->input_zero_point; |
| do { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| int32_t* packed_b = (int32_t*) packed_w; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset]; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } |
| } else { |
| size_t n = nr_block_size; |
| do { |
| *((int32_t*) packed_w) = 0; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } while (--n != 0); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t)); |
| for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| int32_t ksum = 0; |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)]; |
| ksum += (int32_t) kv; |
| *((int8_t*) packed_w) = kv; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t)); |
| } |
| packed_b[nr_block_offset] -= ksum * izp; |
| packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + extra_bytes); |
| } |
| k += nc * kc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } while (--g != 0); |
| } |
| |
| void xnn_pack_qs8_gemm_xw_goi_w( |
| size_t g, |
| size_t nc, |
| size_t kc, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const int8_t* k, |
| const int32_t* b, |
| void* packed_w, |
| size_t extra_bytes, |
| const struct xnn_qs8_packing_params* params) |
| { |
| assert(sr == 1); |
| const int32_t izp = (int32_t) params->input_zero_point; |
| do { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| int32_t* packed_b = (int32_t*) packed_w; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset]; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } |
| } else { |
| size_t n = nr_block_size; |
| do { |
| *((int32_t*) packed_w) = 0; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } while (--n != 0); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t)); |
| for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| int32_t ksum = 0; |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)]; |
| ksum += (int32_t) kv; |
| *((int16_t*) packed_w) = (int16_t) kv; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int16_t)); |
| } |
| packed_b[nr_block_offset] -= ksum * izp; |
| packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int16_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int16_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + extra_bytes); |
| } |
| k += nc * kc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } while (--g != 0); |
| } |
| |
| void xnn_pack_f32_gemm_io_w( |
| size_t nc, |
| size_t kc, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const float* k, |
| const float* b, |
| float* packed_w, |
| const void* params) |
| { |
| const size_t skr = sr * kr; |
| const size_t skc = round_down_po2(kc, skr); |
| const size_t sr_mask = (sr - 1) * kr; |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; |
| } |
| } |
| packed_w += nr; |
| |
| for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) { |
| *packed_w++ = |
| k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)]; |
| } |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| |
| for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| *packed_w++ = |
| k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)]; |
| } |
| packed_w += kr - kr_block_size; |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| } |
| } |
| |
| void xnn_pack_f16_gemm_io_w( |
| size_t nc, |
| size_t kc, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const uint16_t* k, |
| const uint16_t* b, |
| uint16_t* packed_w, |
| const void* params) |
| { |
| const size_t skr = sr * kr; |
| const size_t skc = round_down_po2(kc, skr); |
| const size_t sr_mask = (sr - 1) * kr; |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; |
| } |
| } |
| packed_w += nr; |
| |
| for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) { |
| *packed_w++ = |
| k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)]; |
| } |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| |
| for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| *packed_w++ = |
| k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)]; |
| } |
| packed_w += kr - kr_block_size; |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| } |
| } |
| |
| void xnn_pack_qu8_gemm_io_w( |
| size_t nc, |
| size_t kc, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const uint8_t* k, |
| const int32_t* b, |
| void* packed_w, |
| const struct xnn_qu8_packing_params* params) |
| { |
| assert(sr == 1); |
| const int32_t izp = (int32_t) params->input_zero_point; |
| const int32_t boff = (int32_t) kc * izp * (int32_t) params->kernel_zero_point; |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| int32_t* packed_b = (int32_t*) packed_w; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } |
| } else { |
| size_t n = nr_block_size; |
| do { |
| *((int32_t*) packed_w) = boff; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } while (--n != 0); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t)); |
| for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| int32_t ksum = 0; |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)]; |
| ksum += (int32_t) kv; |
| *((uint8_t*) packed_w) = kv; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t)); |
| } |
| packed_b[nr_block_offset] -= ksum * izp; |
| packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t)); |
| } |
| } |
| } |
| |
| void xnn_pack_qs8_gemm_io_w( |
| size_t nc, |
| size_t kc, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const int8_t* k, |
| const int32_t* b, |
| void* packed_w, |
| const struct xnn_qs8_packing_params* params) |
| { |
| assert(sr == 1); |
| const int32_t izp = (int32_t) params->input_zero_point; |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| int32_t* packed_b = (int32_t*) packed_w; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset]; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } |
| } else { |
| size_t n = nr_block_size; |
| do { |
| *((int32_t*) packed_w) = 0; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } while (--n != 0); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t)); |
| for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| int32_t ksum = 0; |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| const int8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)]; |
| ksum += (int32_t) kv; |
| *((int8_t*) packed_w) = kv; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t)); |
| } |
| packed_b[nr_block_offset] -= ksum * izp; |
| packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t)); |
| } |
| } |
| } |
| |
| void xnn_pack_f32_conv_goki_w( |
| size_t g, |
| size_t nc, |
| size_t ks, |
| size_t kc, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const float* k, |
| const float* b, |
| float* packed_w, |
| size_t extra_bytes, |
| const void* params) |
| { |
| const size_t skr = sr * kr; |
| const size_t skc = round_down_po2(kc, skr); |
| const size_t sr_mask = (sr - 1) * kr; |
| do { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; |
| } |
| } |
| packed_w += nr; |
| |
| for (size_t ki = 0; ki < ks; ki++) { |
| for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) { |
| *packed_w++ = |
| k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset]; |
| } |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| |
| for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| *packed_w++ = |
| k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)]; |
| } |
| packed_w += kr - kr_block_size; |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| } |
| packed_w = (float*) ((uintptr_t) packed_w + extra_bytes); |
| } |
| k += ks * kc * nc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } while (--g != 0); |
| } |
| |
| void xnn_pack_f16_conv_goki_w( |
| size_t g, |
| size_t nc, |
| size_t ks, |
| size_t kc, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const uint16_t* k, |
| const uint16_t* b, |
| uint16_t* packed_w, |
| size_t extra_bytes, |
| const void* params) |
| { |
| const size_t skr = sr * kr; |
| const size_t skc = round_down_po2(kc, skr); |
| const size_t sr_mask = (sr - 1) * kr; |
| do { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; |
| } |
| } |
| packed_w += nr; |
| |
| for (size_t ki = 0; ki < ks; ki++) { |
| for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) { |
| *packed_w++ = |
| k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset]; |
| } |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| |
| for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| *packed_w++ = |
| k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)]; |
| } |
| packed_w += kr - kr_block_size; |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| } |
| packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes); |
| } |
| k += ks * kc * nc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } while (--g != 0); |
| } |
| |
| void xnn_pack_qu8_conv_goki_w( |
| size_t g, |
| size_t nc, |
| size_t ks, |
| size_t kc, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const uint8_t* k, |
| const int32_t* b, |
| void* packed_w, |
| size_t extra_bytes, |
| const struct xnn_qu8_packing_params* params) |
| { |
| assert(sr == 1); |
| const int32_t izp = (int32_t) params->input_zero_point; |
| const int32_t boff = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point; |
| do { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| int32_t* packed_b = (int32_t*) packed_w; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } |
| } else { |
| size_t n = nr_block_size; |
| do { |
| *((int32_t*) packed_w) = boff; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } while (--n != 0); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t)); |
| for (size_t ki = 0; ki < ks; ki++) { |
| for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| int32_t ksum = 0; |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| const uint8_t kv = |
| k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)]; |
| ksum += (int32_t) kv; |
| *((uint8_t*) packed_w) = kv; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t)); |
| } |
| packed_b[nr_block_offset] -= ksum * izp; |
| packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t)); |
| } |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + extra_bytes); |
| } |
| k += ks * kc * nc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } while (--g != 0); |
| } |
| |
| void xnn_pack_qs8_conv_goki_w( |
| size_t g, |
| size_t nc, |
| size_t ks, |
| size_t kc, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const int8_t* k, |
| const int32_t* b, |
| void* packed_w, |
| size_t extra_bytes, |
| const struct xnn_qs8_packing_params* params) |
| { |
| assert(sr == 1); |
| const int32_t izp = (int32_t) params->input_zero_point; |
| do { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| int32_t* packed_b = (int32_t*) packed_w; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset]; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } |
| } else { |
| size_t n = nr_block_size; |
| do { |
| *((int32_t*) packed_w) = 0; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } while (--n != 0); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t)); |
| for (size_t ki = 0; ki < ks; ki++) { |
| for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| int32_t ksum = 0; |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| const int8_t kv = |
| k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)]; |
| ksum += (int32_t) kv; |
| *((int8_t*) packed_w) = kv; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t)); |
| } |
| packed_b[nr_block_offset] -= ksum * izp; |
| packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(int8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t)); |
| } |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + extra_bytes); |
| } |
| k += ks * kc * nc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } while (--g != 0); |
| } |
| |
| void xnn_pack_f32_conv_kgo_w( |
| size_t g, |
| size_t nc, |
| size_t ks, |
| size_t nr, |
| size_t kr, |
| const float* k, |
| const float* b, |
| float* packed_w, |
| const void* params) |
| { |
| for (size_t i = 0; i < g; i++) { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; |
| } |
| } |
| packed_w += nr; |
| for (size_t ki = 0; ki < ks; ki++) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| *packed_w = |
| k[ki * g * nc + (nr_block_start + nr_block_offset)]; |
| packed_w += kr; |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| } |
| k += nc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } |
| } |
| |
| void xnn_pack_f16_conv_kgo_w( |
| size_t g, |
| size_t nc, |
| size_t ks, |
| size_t nr, |
| size_t kr, |
| const uint16_t* k, |
| const uint16_t* b, |
| uint16_t* packed_w, |
| const void* params) |
| { |
| for (size_t i = 0; i < g; i++) { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; |
| } |
| } |
| packed_w += nr; |
| for (size_t ki = 0; ki < ks; ki++) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| *packed_w = |
| k[ki * g * nc + (nr_block_start + nr_block_offset)]; |
| packed_w += kr; |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| } |
| k += nc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } |
| } |
| |
| void xnn_pack_qu8_conv_kgo_w( |
| size_t g, |
| size_t nc, |
| size_t ks, |
| size_t nr, |
| size_t kr, |
| const uint8_t* k, |
| const int32_t* b, |
| void* packed_w, |
| const struct xnn_qu8_packing_params* params) |
| { |
| const int32_t izp = (int32_t) params->input_zero_point; |
| const int32_t boff = (int32_t) ks * izp * (int32_t) params->kernel_zero_point; |
| for (size_t i = 0; i < g; i++) { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| int32_t* packed_b = (int32_t*) packed_w; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } |
| } else { |
| size_t n = nr_block_size; |
| do { |
| *((int32_t*) packed_w) = boff; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } while (--n != 0); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t)); |
| for (size_t ki = 0; ki < ks; ki++) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| const uint8_t kv = |
| k[ki * g * nc + (nr_block_start + nr_block_offset)]; |
| *((uint8_t*) packed_w) = kv; |
| packed_b[nr_block_offset] -= (int32_t) kv * izp; |
| packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(uint8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t)); |
| } |
| } |
| k += nc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } |
| } |
| |
| void xnn_pack_qs8_conv_kgo_w( |
| size_t g, |
| size_t nc, |
| size_t ks, |
| size_t nr, |
| size_t kr, |
| const int8_t* k, |
| const int32_t* b, |
| void* packed_w, |
| const struct xnn_qs8_packing_params* params) |
| { |
| const int32_t izp = (int32_t) params->input_zero_point; |
| for (size_t i = 0; i < g; i++) { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| int32_t* packed_b = (int32_t*) packed_w; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset]; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } |
| } else { |
| size_t n = nr_block_size; |
| do { |
| *((int32_t*) packed_w) = 0; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } while (--n != 0); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t)); |
| for (size_t ki = 0; ki < ks; ki++) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| const int8_t kv = |
| k[ki * g * nc + (nr_block_start + nr_block_offset)]; |
| *((int8_t*) packed_w) = kv; |
| packed_b[nr_block_offset] -= (int32_t) kv * izp; |
| packed_w = (void*) ((uintptr_t) packed_w + kr * sizeof(int8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(int8_t)); |
| } |
| } |
| k += nc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } |
| } |
| |
| void xnn_pack_f32_deconv_goki_w( |
| size_t g, |
| size_t nc, |
| size_t kh, |
| size_t kw, |
| size_t kc, |
| size_t sh, |
| size_t sw, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const float* k, |
| const float* b, |
| float* packed_w, |
| struct subconvolution_params* subconv_params, |
| const void* params) |
| { |
| const size_t skr = sr * kr; |
| const size_t skc = round_down_po2(kc, skr); |
| const size_t sr_mask = (sr - 1) * kr; |
| for (size_t i = 0; i < g; i++) { |
| for (size_t oy = 0; oy < sh; oy++) { |
| for (size_t ox = 0; ox < sw; ox++) { |
| if (i == 0) { |
| (*subconv_params++).weights = packed_w; |
| } |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; |
| } |
| } |
| packed_w += nr; |
| for (size_t ky = oy; ky < kh; ky += sh) { |
| for (size_t kx = ox; kx < kw; kx += sw) { |
| for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) { |
| *packed_w++ = |
| k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset]; |
| } |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| |
| for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| *packed_w++ = |
| k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)]; |
| } |
| packed_w += kr - kr_block_size; |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| } |
| } |
| } |
| } |
| } |
| k += kh * kw * kc * nc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } |
| } |
| |
| void xnn_pack_f16_deconv_goki_w( |
| size_t g, |
| size_t nc, |
| size_t kh, |
| size_t kw, |
| size_t kc, |
| size_t sh, |
| size_t sw, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const uint16_t* k, |
| const uint16_t* b, |
| uint16_t* packed_w, |
| struct subconvolution_params* subconv_params, |
| const void* params) |
| { |
| const size_t skr = sr * kr; |
| const size_t skc = round_down_po2(kc, skr); |
| const size_t sr_mask = (sr - 1) * kr; |
| for (size_t i = 0; i < g; i++) { |
| for (size_t oy = 0; oy < sh; oy++) { |
| for (size_t ox = 0; ox < sw; ox++) { |
| if (i == 0) { |
| (*subconv_params++).weights = packed_w; |
| } |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset]; |
| } |
| } |
| packed_w += nr; |
| for (size_t ky = oy; ky < kh; ky += sh) { |
| for (size_t kx = ox; kx < kw; kx += sw) { |
| for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) { |
| *packed_w++ = |
| k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset]; |
| } |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| |
| for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| *packed_w++ = |
| k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)]; |
| } |
| packed_w += kr - kr_block_size; |
| } |
| packed_w += (nr - nr_block_size) * kr; |
| } |
| } |
| } |
| } |
| } |
| } |
| k += kh * kw * kc * nc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } |
| } |
| |
| void xnn_pack_qu8_deconv_goki_w( |
| size_t g, |
| size_t nc, |
| size_t kh, |
| size_t kw, |
| size_t kc, |
| size_t sh, |
| size_t sw, |
| size_t nr, |
| size_t kr, |
| size_t sr, |
| const uint8_t* k, |
| const int32_t* b, |
| void* packed_w, |
| struct subconvolution_params* subconv_params, |
| const struct xnn_qu8_packing_params* params) |
| { |
| assert(sr == 1); |
| const int32_t izp = (int32_t) params->input_zero_point; |
| const int32_t kzp = (int32_t) params->kernel_zero_point; |
| for (size_t i = 0; i < g; i++) { |
| for (size_t oy = 0; oy < sh; oy++) { |
| for (size_t ox = 0; ox < sw; ox++) { |
| if (i == 0) { |
| (*subconv_params++).weights = packed_w; |
| } |
| const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp; |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| int32_t* packed_b = (int32_t*) packed_w; |
| if XNN_LIKELY(b != 0) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| *((int32_t*) packed_w) = b[nr_block_start + nr_block_offset] + boff; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } |
| } else { |
| size_t n = nr_block_size; |
| do { |
| *((int32_t*) packed_w) = boff; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } while (--n != 0); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t)); |
| for (size_t ky = oy; ky < kh; ky += sh) { |
| for (size_t kx = ox; kx < kw; kx += sw) { |
| for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) { |
| const size_t kr_block_size = min(kc - kr_block_start, kr); |
| for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) { |
| int32_t ksum = 0; |
| for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) { |
| const uint8_t kv = |
| k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)]; |
| ksum += (int32_t) kv; |
| *((uint8_t*) packed_w) = kv; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t)); |
| } |
| packed_b[nr_block_offset] -= ksum * izp; |
| packed_w = (void*) ((uintptr_t) packed_w + (kr - kr_block_size) * sizeof(uint8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * kr * sizeof(uint8_t)); |
| } |
| } |
| } |
| } |
| } |
| } |
| k += kh * kw * kc * nc; |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nc; |
| } |
| } |
| } |
| |
| void xnn_pack_f32_dwconv_ghw_w( |
| size_t h, |
| size_t w, |
| size_t c, |
| size_t cr, |
| const float* k, |
| const float* b, |
| float* packed_w, |
| size_t extra_bytes, |
| const void* params) |
| { |
| for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) { |
| const size_t cr_block_size = min(c - cr_block_start, cr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| *packed_w++ = b[cr_block_start + cr_block_offset]; |
| } |
| } else { |
| size_t n = cr_block_size; |
| do { |
| *packed_w++ = 0.0f; |
| } while (--n != 0); |
| } |
| packed_w += cr - cr_block_size; |
| for (size_t x = 0; x < w; x++) { |
| for (size_t y = 0; y < h; y++) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x]; |
| *packed_w++ = kv; |
| } |
| packed_w += cr - cr_block_size; |
| } |
| } |
| packed_w = (float*) ((uintptr_t) packed_w + extra_bytes); |
| } |
| } |
| |
| void xnn_pack_f16_dwconv_ghw_w( |
| size_t h, |
| size_t w, |
| size_t c, |
| size_t cr, |
| const uint16_t* k, |
| const uint16_t* b, |
| uint16_t* packed_w, |
| size_t extra_bytes, |
| const void* params) |
| { |
| for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) { |
| const size_t cr_block_size = min(c - cr_block_start, cr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| *packed_w++ = b[cr_block_start + cr_block_offset]; |
| } |
| } else { |
| size_t n = cr_block_size; |
| do { |
| *packed_w++ = 0; |
| } while (--n != 0); |
| } |
| packed_w += cr - cr_block_size; |
| for (size_t x = 0; x < w; x++) { |
| for (size_t y = 0; y < h; y++) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x]; |
| *packed_w++ = kv; |
| } |
| packed_w += cr - cr_block_size; |
| } |
| } |
| packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes); |
| } |
| } |
| |
| void xnn_pack_qu8_dwconv_ghw_w( |
| size_t h, |
| size_t w, |
| size_t c, |
| size_t cr, |
| const uint8_t* k, |
| const int32_t* b, |
| void* packed_w, |
| size_t extra_bytes, |
| const struct xnn_qu8_packing_params* params) |
| { |
| const int32_t izp = (int32_t) params->input_zero_point; |
| const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point; |
| for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) { |
| const size_t cr_block_size = min(c - cr_block_start, cr); |
| int32_t* packed_b = (int32_t*) packed_w; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } |
| } else { |
| size_t n = cr_block_size; |
| do { |
| *((int32_t*) packed_w) = boff; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } while (--n != 0); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t)); |
| for (size_t x = 0; x < w; x++) { |
| for (size_t y = 0; y < h; y++) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x]; |
| packed_b[cr_block_offset] -= (int32_t) kv * izp; |
| *((uint8_t*) packed_w) = kv; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t)); |
| } |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + extra_bytes); |
| } |
| } |
| |
| void xnn_pack_qs8_dwconv_ghw_w( |
| size_t h, |
| size_t w, |
| size_t c, |
| size_t cr, |
| const int8_t* k, |
| const int32_t* b, |
| void* packed_w, |
| size_t extra_bytes, |
| const struct xnn_qs8_packing_params* params) |
| { |
| const int32_t izp = (int32_t) params->input_zero_point; |
| for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) { |
| const size_t cr_block_size = min(c - cr_block_start, cr); |
| int32_t* packed_b = (int32_t*) packed_w; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset]; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } |
| } else { |
| size_t n = cr_block_size; |
| do { |
| *((int32_t*) packed_w) = 0; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } while (--n != 0); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t)); |
| for (size_t x = 0; x < w; x++) { |
| for (size_t y = 0; y < h; y++) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x]; |
| packed_b[cr_block_offset] -= (int32_t) kv * izp; |
| *((int8_t*) packed_w) = kv; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t)); |
| } |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + extra_bytes); |
| } |
| } |
| |
| void xnn_pack_f32_dwconv_hwg_w( |
| size_t h, |
| size_t w, |
| size_t c, |
| size_t cr, |
| const float* k, |
| const float* b, |
| float* packed_w, |
| const void* params) |
| { |
| for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) { |
| const size_t cr_block_size = min(c - cr_block_start, cr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| *packed_w++ = b[cr_block_start + cr_block_offset]; |
| } |
| } else { |
| size_t n = cr_block_size; |
| do { |
| *packed_w++ = 0.0f; |
| } while (--n != 0); |
| } |
| packed_w += cr - cr_block_size; |
| for (size_t x = 0; x < w; x++) { |
| for (size_t y = 0; y < h; y++) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)]; |
| *packed_w++ = kv; |
| } |
| packed_w += cr - cr_block_size; |
| } |
| } |
| } |
| } |
| |
| void xnn_pack_f16_dwconv_hwg_w( |
| size_t h, |
| size_t w, |
| size_t c, |
| size_t cr, |
| const uint16_t* k, |
| const uint16_t* b, |
| uint16_t* packed_w, |
| const void* params) |
| { |
| for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) { |
| const size_t cr_block_size = min(c - cr_block_start, cr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| *packed_w++ = b[cr_block_start + cr_block_offset]; |
| } |
| } else { |
| size_t n = cr_block_size; |
| do { |
| *packed_w++ = 0; |
| } while (--n != 0); |
| } |
| packed_w += cr - cr_block_size; |
| for (size_t x = 0; x < w; x++) { |
| for (size_t y = 0; y < h; y++) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)]; |
| *packed_w++ = kv; |
| } |
| packed_w += cr - cr_block_size; |
| } |
| } |
| } |
| } |
| |
| void xnn_pack_qu8_dwconv_hwg_w( |
| size_t h, |
| size_t w, |
| size_t c, |
| size_t cr, |
| const uint8_t* k, |
| const int32_t* b, |
| void* packed_w, |
| const struct xnn_qu8_packing_params* params) |
| { |
| const int32_t izp = (int32_t) params->input_zero_point; |
| const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point; |
| for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) { |
| const size_t cr_block_size = min(c - cr_block_start, cr); |
| int32_t* packed_b = (int32_t*) packed_w; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset] + boff; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } |
| } else { |
| size_t n = cr_block_size; |
| do { |
| *((int32_t*) packed_w) = boff; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } while (--n != 0); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t)); |
| for (size_t x = 0; x < w; x++) { |
| for (size_t y = 0; y < h; y++) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)]; |
| packed_b[cr_block_offset] -= (int32_t) kv * izp; |
| *((uint8_t*) packed_w) = kv; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t)); |
| } |
| } |
| } |
| } |
| |
| void xnn_pack_qs8_dwconv_hwg_w( |
| size_t h, |
| size_t w, |
| size_t c, |
| size_t cr, |
| const int8_t* k, |
| const int32_t* b, |
| void* packed_w, |
| const struct xnn_qs8_packing_params* params) |
| { |
| const int32_t izp = (int32_t) params->input_zero_point; |
| for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) { |
| const size_t cr_block_size = min(c - cr_block_start, cr); |
| int32_t* packed_b = (int32_t*) packed_w; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| *((int32_t*) packed_w) = b[cr_block_start + cr_block_offset]; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } |
| } else { |
| size_t n = cr_block_size; |
| do { |
| *((int32_t*) packed_w) = 0; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t)); |
| } while (--n != 0); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t)); |
| for (size_t x = 0; x < w; x++) { |
| for (size_t y = 0; y < h; y++) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)]; |
| packed_b[cr_block_offset] -= (int32_t) kv * izp; |
| *((int8_t*) packed_w) = kv; |
| packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t)); |
| } |
| packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t)); |
| } |
| } |
| } |
| } |
| |
// Packs F32 GEMM weights, read as [group][output channel][input channel],
// into blocks of nr output channels without any bias row.
//
// Within a block the input channels are emitted in kr-wide slices, nr slots
// per slice. Slots for output channels past the end of the matrix, and the
// tail of a short final slice, are skipped without being written.
// g must be at least 1: the group loop runs its body before testing the count.
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  // Input channels are shuffled within spans of sr * kr elements; channels past
  // the last whole span are copied without shuffling.
  // NOTE(review): round_down_po2 presumably expects a power-of-two span - confirm.
  const size_t shuffle_span = sr * kr;
  const size_t shuffled_kc = round_down_po2(kc, shuffle_span);
  const size_t shuffle_mask = (sr - 1) * kr;
  const size_t group_stride = nc * kc;

  do {
    for (size_t n_start = 0; n_start < nc; n_start += nr) {
      const size_t n_count = min(nc - n_start, nr);

      // Whole spans: the kr-slice read for channel n is rotated by n * kr
      // inside the current span.
      for (size_t k_start = 0; k_start < shuffled_kc; k_start += kr) {
        const size_t span_start = round_down_po2(k_start, shuffle_span);
        for (size_t n = 0; n < n_count; n++) {
          const size_t rotated = (k_start + n * kr) & shuffle_mask;
          const size_t src = (n_start + n) * kc + span_start + rotated;
          for (size_t j = 0; j < kr; j++) {
            *packed_w++ = k[src + j];
          }
        }
        packed_w += (nr - n_count) * kr;
      }

      // Remaining input channels: straight copy, short slices padded to kr.
      for (size_t k_start = shuffled_kc; k_start < kc; k_start += kr) {
        const size_t k_count = min(kc - k_start, kr);
        for (size_t n = 0; n < n_count; n++) {
          const size_t src = (n_start + n) * kc + k_start;
          for (size_t j = 0; j < k_count; j++) {
            packed_w[j] = k[src + j];
          }
          packed_w += kr;
        }
        packed_w += (nr - n_count) * kr;
      }
    }
    k += group_stride;
  } while (--g != 0);
}
| |
// Packs F16 GEMM weights (carried as raw uint16_t values), read as
// [group][output channel][input channel], into blocks of nr output channels
// without any bias row.
//
// Within a block the input channels are emitted in kr-wide slices, nr slots
// per slice. Slots for output channels past the end of the matrix, and the
// tail of a short final slice, are skipped without being written.
// g must be at least 1: the group loop runs its body before testing the count.
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  // Input channels are shuffled within spans of sr * kr elements; channels past
  // the last whole span are copied without shuffling.
  // NOTE(review): round_down_po2 presumably expects a power-of-two span - confirm.
  const size_t shuffle_span = sr * kr;
  const size_t shuffled_kc = round_down_po2(kc, shuffle_span);
  const size_t shuffle_mask = (sr - 1) * kr;
  const size_t group_stride = nc * kc;

  do {
    for (size_t n_start = 0; n_start < nc; n_start += nr) {
      const size_t n_count = min(nc - n_start, nr);

      // Whole spans: the kr-slice read for channel n is rotated by n * kr
      // inside the current span.
      for (size_t k_start = 0; k_start < shuffled_kc; k_start += kr) {
        const size_t span_start = round_down_po2(k_start, shuffle_span);
        for (size_t n = 0; n < n_count; n++) {
          const size_t rotated = (k_start + n * kr) & shuffle_mask;
          const size_t src = (n_start + n) * kc + span_start + rotated;
          for (size_t j = 0; j < kr; j++) {
            *packed_w++ = k[src + j];
          }
        }
        packed_w += (nr - n_count) * kr;
      }

      // Remaining input channels: straight copy, short slices padded to kr.
      for (size_t k_start = shuffled_kc; k_start < kc; k_start += kr) {
        const size_t k_count = min(kc - k_start, kr);
        for (size_t n = 0; n < n_count; n++) {
          const size_t src = (n_start + n) * kc + k_start;
          for (size_t j = 0; j < k_count; j++) {
            packed_w[j] = k[src + j];
          }
          packed_w += kr;
        }
        packed_w += (nr - n_count) * kr;
      }
    }
    k += group_stride;
  } while (--g != 0);
}
| |
| void xnn_pack_f32_dconv_oki_w( |
| size_t nc, |
| size_t kc, |
| size_t nr, |
| size_t kh, |
| size_t kw, |
| const float* k, |
| const float* b, |
| float* packed_w, |
| const void* params) |
| { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) { |
| *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)]; |
| } |
| } else { |
| size_t n = nr; |
| do { |
| *packed_w++ = 0.0f; |
| } while (--n != 0); |
| } |
| |
| for (size_t kx = 0; kx < kw; kx++) { |
| for (size_t c = 0; c < kc; c++) { |
| for (size_t ky = 0; ky < kh; ky++) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) { |
| *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c]; |
| } |
| } |
| } |
| } |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nr; |
| } |
| } |
| } |
| |
| void xnn_pack_f16_dconv_oki_w( |
| size_t nc, |
| size_t kc, |
| size_t nr, |
| size_t kh, |
| size_t kw, |
| const uint16_t* k, |
| const uint16_t* b, |
| uint16_t* packed_w, |
| const void* params) |
| { |
| for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) { |
| const size_t nr_block_size = min(nc - nr_block_start, nr); |
| if XNN_LIKELY(b != NULL) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) { |
| *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)]; |
| } |
| } else { |
| size_t n = nr; |
| do { |
| *packed_w++ = 0; |
| } while (--n != 0); |
| } |
| |
| for (size_t kx = 0; kx < kw; kx++) { |
| for (size_t c = 0; c < kc; c++) { |
| for (size_t ky = 0; ky < kh; ky++) { |
| for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) { |
| *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c]; |
| } |
| } |
| } |
| } |
| if XNN_UNPREDICTABLE(b != NULL) { |
| b += nr; |
| } |
| } |
| } |
| |
| void xnn_pack_f32_chw_dwconv_ghw_w( |
| size_t kernel_size, |
| size_t groups, |
| const float* kernel, |
| const float* bias, |
| float* packed_weights, |
| const void* params) |
| { |
| for (size_t g = 0; g < groups; g++) { |
| if XNN_LIKELY(bias != NULL) { |
| *packed_weights = *bias++; |
| } else { |
| *packed_weights = 0.0f; |
| } |
| packed_weights += 1; |
| for (size_t i = 0; i < kernel_size; i++) { |
| *packed_weights++ = kernel[g * kernel_size + i]; |
| } |
| } |
| } |
| |
| void xnn_pack_f16_chw_dwconv_ghw_w( |
| size_t kernel_size, |
| size_t groups, |
| const uint16_t* kernel, |
| const uint16_t* bias, |
| uint16_t* packed_weights, |
| const void* params) |
| { |
| for (size_t g = 0; g < groups; g++) { |
| if XNN_LIKELY(bias != NULL) { |
| *packed_weights = *bias++; |
| } else { |
| *packed_weights = 0; |
| } |
| packed_weights += 1; |
| for (size_t i = 0; i < kernel_size; i++) { |
| *packed_weights++ = kernel[g * kernel_size + i]; |
| } |
| } |
| } |
| |
| void xnn_pack_f32_chw_dwconv_hwg_w( |
| size_t kernel_size, |
| size_t groups, |
| const float* kernel, |
| const float* bias, |
| float* packed_weights, |
| const void* params) |
| { |
| for (size_t g = 0; g < groups; g++) { |
| if XNN_LIKELY(bias != NULL) { |
| *packed_weights = *bias++; |
| } else { |
| *packed_weights = 0.0f; |
| } |
| packed_weights += 1; |
| for (size_t i = 0; i < kernel_size; i++) { |
| *packed_weights++ = kernel[i * groups + g]; |
| } |
| } |
| } |
| |
| void xnn_pack_f32_vmulcaddc_w( |
| size_t c, |
| size_t cr, |
| const float* s, |
| const float* b, |
| float* packed_w, |
| const void* params) |
| { |
| for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) { |
| const size_t cr_block_size = min(c - cr_block_start, cr); |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| *packed_w++ = s[cr_block_start + cr_block_offset]; |
| } |
| packed_w += cr - cr_block_size; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| *packed_w++ = b[cr_block_start + cr_block_offset]; |
| } |
| } else { |
| size_t n = cr_block_size; |
| do { |
| *packed_w++ = 0.0f; |
| } while (--n != 0); |
| } |
| packed_w += cr - cr_block_size; |
| } |
| } |
| |
| void xnn_pack_f16_vmulcaddc_w( |
| size_t c, |
| size_t cr, |
| const uint16_t* s, |
| const uint16_t* b, |
| uint16_t* packed_w, |
| const void* params) |
| { |
| for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) { |
| const size_t cr_block_size = min(c - cr_block_start, cr); |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| *packed_w++ = s[cr_block_start + cr_block_offset]; |
| } |
| packed_w += cr - cr_block_size; |
| if XNN_LIKELY(b != NULL) { |
| for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) { |
| *packed_w++ = b[cr_block_start + cr_block_offset]; |
| } |
| } else { |
| size_t n = cr_block_size; |
| do { |
| *packed_w++ = 0; |
| } while (--n != 0); |
| } |
| packed_w += cr - cr_block_size; |
| } |
| } |