Blame - src/packing.c - platform/external/XNNPACK

blob: d5357c3dc647dd5da2a4920a35c151aed41c1b94 [file] [log] [blame]

Marat Dukhan	ab58238	2020-07-06 13:32:08 -0700	[diff] [blame^]	1	// Copyright (c) Facebook, Inc. and its affiliates.
				2	// All rights reserved.
				3	//
				4	// Copyright 2019 Google LLC
				5	//
				6	// This source code is licensed under the BSD-style license found in the
				7	// LICENSE file in the root directory of this source tree.
				8
				9	#include <stdint.h>
				10	#include <stddef.h>
				11
				12	#include <xnnpack/math.h>
				13	#include <xnnpack/pack.h>
				14
				15
				16	void xnn_pack_q8_gemm_goi_w(
				17	size_t g,
				18	size_t nc,
				19	size_t kc,
				20	uint32_t nr,
				21	uint32_t kr,
				22	uint8_t izp,
				23	uint8_t kzp,
				24	const uint8_t* k,
				25	const int32_t* b,
				26	void* packed_w)
				27	{
				28	const int32_t boff = (int32_t) kc * (int32_t) izp * (int32_t) kzp;
				29	do {
				30	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				31	const size_t nr_block_size = min(nc - nr_block_start, nr);
				32	int32_t* packed_b = (int32_t*) packed_w;
				33	if XNN_LIKELY(b != NULL) {
				34	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				35	((int32_t) packed_w) = b[nr_block_start + nr_block_offset] + boff;
				36	packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
				37	}
				38	} else {
				39	size_t n = nr_block_size;
				40	do {
				41	((int32_t) packed_w) = boff;
				42	packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
				43	} while (--n != 0);
				44	}
				45	packed_w = (void) ((uintptr_t) packed_w + (nr - nr_block_size) sizeof(int32_t));
				46	for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
				47	const size_t kr_block_size = min(kc - kr_block_start, kr);
				48	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				49	int32_t ksum = 0;
				50	for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
				51	const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
				52	ksum += (int32_t) kv;
				53	((uint8_t) packed_w) = kv;
				54	packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
				55	}
				56	packed_b[nr_block_offset] -= ksum * (int32_t) izp;
				57	packed_w = (void) ((uintptr_t) packed_w + (kr - kr_block_size) sizeof(uint8_t));
				58	}
				59	packed_w = (void) ((uintptr_t) packed_w + (nr - nr_block_size) kr * sizeof(uint8_t));
				60	}
				61	}
				62	k += nc * kc;
				63	if XNN_UNPREDICTABLE(b != NULL) {
				64	b += nc;
				65	}
				66	} while (--g != 0);
				67	}
				68
				69	void xnn_pack_q8_gemm_io_w(
				70	size_t nc,
				71	size_t kc,
				72	uint32_t nr,
				73	uint32_t kr,
				74	uint8_t izp,
				75	uint8_t kzp,
				76	const uint8_t* k,
				77	const int32_t* b,
				78	void* packed_w)
				79	{
				80	const int32_t boff = (int32_t) kc * (int32_t) izp * (int32_t) kzp;
				81	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				82	const size_t nr_block_size = min(nc - nr_block_start, nr);
				83	int32_t* packed_b = (int32_t*) packed_w;
				84	if XNN_LIKELY(b != NULL) {
				85	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				86	((int32_t) packed_w) = b[nr_block_start + nr_block_offset] + boff;
				87	packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
				88	}
				89	} else {
				90	size_t n = nr_block_size;
				91	do {
				92	((int32_t) packed_w) = boff;
				93	packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
				94	} while (--n != 0);
				95	}
				96	packed_w = (void) ((uintptr_t) packed_w + (nr - nr_block_size) sizeof(int32_t));
				97	for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
				98	const size_t kr_block_size = min(kc - kr_block_start, kr);
				99	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				100	int32_t ksum = 0;
				101	for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
				102	const uint8_t kv = k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
				103	ksum += (int32_t) kv;
				104	((uint8_t) packed_w) = kv;
				105	packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
				106	}
				107	packed_b[nr_block_offset] -= ksum * (int32_t) izp;
				108	packed_w = (void) ((uintptr_t) packed_w + (kr - kr_block_size) sizeof(uint8_t));
				109	}
				110	packed_w = (void) ((uintptr_t) packed_w + (nr - nr_block_size) kr * sizeof(uint8_t));
				111	}
				112	}
				113	}
				114
				115	void xnn_pack_q8_conv_goki_w(
				116	size_t g,
				117	size_t nc,
				118	size_t ks,
				119	size_t kc,
				120	uint32_t nr,
				121	uint32_t kr,
				122	uint8_t izp,
				123	uint8_t kzp,
				124	const uint8_t* k,
				125	const int32_t* b,
				126	void* packed_w)
				127	{
				128	const int32_t boff = (int32_t) ks * (int32_t) kc * (int32_t) izp * (int32_t) kzp;
				129	do {
				130	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				131	const size_t nr_block_size = min(nc - nr_block_start, nr);
				132	int32_t* packed_b = (int32_t*) packed_w;
				133	if XNN_LIKELY(b != NULL) {
				134	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				135	((int32_t) packed_w) = b[nr_block_start + nr_block_offset] + boff;
				136	packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
				137	}
				138	} else {
				139	size_t n = nr_block_size;
				140	do {
				141	((int32_t) packed_w) = boff;
				142	packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
				143	} while (--n != 0);
				144	}
				145	packed_w = (void) ((uintptr_t) packed_w + (nr - nr_block_size) sizeof(int32_t));
				146	for (size_t ki = 0; ki < ks; ki++) {
				147	for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
				148	const size_t kr_block_size = min(kc - kr_block_start, kr);
				149	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				150	int32_t ksum = 0;
				151	for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
				152	const uint8_t kv =
				153	k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
				154	ksum += (int32_t) kv;
				155	((uint8_t) packed_w) = kv;
				156	packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
				157	}
				158	packed_b[nr_block_offset] -= ksum * (int32_t) izp;
				159	packed_w = (void) ((uintptr_t) packed_w + (kr - kr_block_size) sizeof(uint8_t));
				160	}
				161	packed_w = (void) ((uintptr_t) packed_w + (nr - nr_block_size) kr * sizeof(uint8_t));
				162	}
				163	}
				164	}
				165	k += ks * kc * nc;
				166	if XNN_UNPREDICTABLE(b != NULL) {
				167	b += nc;
				168	}
				169	} while (--g != 0);
				170	}
				171
				172	void xnn_pack_q8_conv_kgo_w(
				173	size_t g,
				174	size_t nc,
				175	size_t ks,
				176	uint32_t nr,
				177	uint32_t kr,
				178	uint8_t izp,
				179	uint8_t kzp,
				180	const uint8_t* k,
				181	const int32_t* b,
				182	void* packed_w)
				183	{
				184	const int32_t boff = (int32_t) ks * (int32_t) izp * (int32_t) kzp;
				185	for (size_t i = 0; i < g; i++) {
				186	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				187	const size_t nr_block_size = min(nc - nr_block_start, nr);
				188	int32_t* packed_b = (int32_t*) packed_w;
				189	if XNN_LIKELY(b != NULL) {
				190	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				191	((int32_t) packed_w) = b[nr_block_start + nr_block_offset] + boff;
				192	packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
				193	}
				194	} else {
				195	size_t n = nr_block_size;
				196	do {
				197	((int32_t) packed_w) = boff;
				198	packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
				199	} while (--n != 0);
				200	}
				201	packed_w = (void) ((uintptr_t) packed_w + (nr - nr_block_size) sizeof(int32_t));
				202	for (size_t ki = 0; ki < ks; ki++) {
				203	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				204	const uint8_t kv =
				205	k[ki * g * nc + (nr_block_start + nr_block_offset)];
				206	((uint8_t) packed_w) = kv;
				207	packed_b[nr_block_offset] -= (int32_t) kv * (int32_t) izp;
				208	packed_w = (void) ((uintptr_t) packed_w + kr sizeof(uint8_t));
				209	}
				210	packed_w = (void) ((uintptr_t) packed_w + (nr - nr_block_size) kr * sizeof(uint8_t));
				211	}
				212	}
				213	k += nc;
				214	if XNN_UNPREDICTABLE(b != NULL) {
				215	b += nc;
				216	}
				217	}
				218	}
				219
				220	void xnn_pack_q8_deconv_goki_w(
				221	size_t g,
				222	size_t nc,
				223	size_t kh,
				224	size_t kw,
				225	size_t kc,
				226	size_t sh,
				227	size_t sw,
				228	size_t nr,
				229	size_t kr,
				230	uint8_t izp,
				231	uint8_t kzp,
				232	const uint8_t* k,
				233	const int32_t* b,
				234	void* packed_w,
				235	struct subconvolution_params* params)
				236	{
				237	for (size_t i = 0; i < g; i++) {
				238	for (size_t oy = 0; oy < sh; oy++) {
				239	for (size_t ox = 0; ox < sw; ox++) {
				240	if (i == 0) {
				241	(*params++).weights = packed_w;
				242	}
				243	const int32_t boff = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * (int32_t) izp * (int32_t) kzp;
				244	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				245	const size_t nr_block_size = min(nc - nr_block_start, nr);
				246	int32_t* packed_b = (int32_t*) packed_w;
				247	if XNN_LIKELY(b != 0) {
				248	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				249	((int32_t) packed_w) = b[nr_block_start + nr_block_offset] + boff;
				250	packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
				251	}
				252	} else {
				253	size_t n = nr_block_size;
				254	do {
				255	((int32_t) packed_w) = boff;
				256	packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
				257	} while (--n != 0);
				258	}
				259	packed_w = (void) ((uintptr_t) packed_w + (nr - nr_block_size) sizeof(int32_t));
				260	for (size_t ky = oy; ky < kh; ky += sh) {
				261	for (size_t kx = ox; kx < kw; kx += sw) {
				262	for (size_t kr_block_start = 0; kr_block_start < kc; kr_block_start += kr) {
				263	const size_t kr_block_size = min(kc - kr_block_start, kr);
				264	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				265	int32_t ksum = 0;
				266	for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
				267	const uint8_t kv =
				268	k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
				269	ksum += (int32_t) kv;
				270	((uint8_t) packed_w) = kv;
				271	packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
				272	}
				273	packed_b[nr_block_offset] -= ksum * (int32_t) izp;
				274	packed_w = (void) ((uintptr_t) packed_w + (kr - kr_block_size) sizeof(uint8_t));
				275	}
				276	packed_w = (void) ((uintptr_t) packed_w + (nr - nr_block_size) kr * sizeof(uint8_t));
				277	}
				278	}
				279	}
				280	}
				281	}
				282	}
				283	k += kh * kw * kc * nc;
				284	if XNN_UNPREDICTABLE(b != NULL) {
				285	b += nc;
				286	}
				287	}
				288	}
				289
				290	void xnn_pack_q8_dwconv_ghw_w(
				291	size_t h,
				292	size_t w,
				293	size_t c,
				294	size_t cr,
				295	uint8_t izp,
				296	uint8_t kzp,
				297	const uint8_t* k,
				298	const int32_t* b,
				299	void* packed_w)
				300	{
				301	const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;
				302	for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
				303	const size_t cr_block_size = min(c - cr_block_start, cr);
				304	int32_t* packed_b = (int32_t*) packed_w;
				305	if XNN_LIKELY(b != NULL) {
				306	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				307	((int32_t) packed_w) = b[cr_block_start + cr_block_offset] + boff;
				308	packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
				309	}
				310	} else {
				311	size_t n = cr_block_size;
				312	do {
				313	((int32_t) packed_w) = boff;
				314	packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
				315	} while (--n != 0);
				316	}
				317	packed_w = (void) ((uintptr_t) packed_w + (cr - cr_block_size) sizeof(int32_t));
				318	for (size_t x = 0; x < w; x++) {
				319	for (size_t y = 0; y < h; y++) {
				320	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				321	const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
				322	packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;
				323	((uint8_t) packed_w) = kv;
				324	packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
				325	}
				326	packed_w = (void) ((uintptr_t) packed_w + (cr - cr_block_size) sizeof(uint8_t));
				327	}
				328	}
				329	}
				330	}
				331
				332	void xnn_pack_q8_dwconv_hwg_w(
				333	size_t h,
				334	size_t w,
				335	size_t c,
				336	size_t cr,
				337	uint8_t izp,
				338	uint8_t kzp,
				339	const uint8_t* k,
				340	const int32_t* b,
				341	void* packed_w)
				342	{
				343	const int32_t boff = (int32_t) h * (int32_t) w * (int32_t) izp * (int32_t) kzp;
				344	for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
				345	const size_t cr_block_size = min(c - cr_block_start, cr);
				346	int32_t* packed_b = (int32_t*) packed_w;
				347	if XNN_LIKELY(b != NULL) {
				348	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				349	((int32_t) packed_w) = b[cr_block_start + cr_block_offset] + boff;
				350	packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
				351	}
				352	} else {
				353	size_t n = cr_block_size;
				354	do {
				355	((int32_t) packed_w) = boff;
				356	packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
				357	} while (--n != 0);
				358	}
				359	packed_w = (void) ((uintptr_t) packed_w + (cr - cr_block_size) sizeof(int32_t));
				360	for (size_t x = 0; x < w; x++) {
				361	for (size_t y = 0; y < h; y++) {
				362	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				363	const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
				364	packed_b[cr_block_offset] -= (int32_t) kv * (int32_t) izp;
				365	((uint8_t) packed_w) = kv;
				366	packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
				367	}
				368	packed_w = (void) ((uintptr_t) packed_w + (cr - cr_block_size) sizeof(uint8_t));
				369	}
				370	}
				371	}
				372	}
				373
				374	void xnn_pack_f16_gemm_goi_w(
				375	size_t g,
				376	size_t nc,
				377	size_t kc,
				378	size_t nr,
				379	size_t kr,
				380	size_t sr,
				381	const uint16_t* k,
				382	const uint16_t* b,
				383	uint16_t* packed_w)
				384	{
				385	const size_t skr = sr * kr;
				386	const size_t skc = round_down_po2(kc, skr);
				387	const size_t sr_mask = (sr - 1) * kr;
				388	do {
				389	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				390	const size_t nr_block_size = min(nc - nr_block_start, nr);
				391	if XNN_LIKELY(b != NULL) {
				392	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				393	packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
				394	}
				395	}
				396	packed_w += nr;
				397
				398	for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
				399	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				400	for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
				401	*packed_w++ =
				402	k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
				403	}
				404	}
				405	packed_w += (nr - nr_block_size) * kr;
				406	}
				407
				408	for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
				409	const size_t kr_block_size = min(kc - kr_block_start, kr);
				410	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				411	for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
				412	*packed_w++ =
				413	k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
				414	}
				415	packed_w += kr - kr_block_size;
				416	}
				417	packed_w += (nr - nr_block_size) * kr;
				418	}
				419	}
				420	k += nc * kc;
				421	if XNN_UNPREDICTABLE(b != NULL) {
				422	b += nc;
				423	}
				424	} while (--g != 0);
				425	}
				426
				427	void xnn_pack_f16_gemm_io_w(
				428	size_t nc,
				429	size_t kc,
				430	size_t nr,
				431	size_t kr,
				432	size_t sr,
				433	const uint16_t* k,
				434	const uint16_t* b,
				435	uint16_t* packed_w)
				436	{
				437	const size_t skr = sr * kr;
				438	const size_t skc = round_down_po2(kc, skr);
				439	const size_t sr_mask = (sr - 1) * kr;
				440	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				441	const size_t nr_block_size = min(nc - nr_block_start, nr);
				442	if XNN_LIKELY(b != NULL) {
				443	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				444	packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
				445	}
				446	}
				447	packed_w += nr;
				448
				449	for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
				450	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				451	for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
				452	*packed_w++ =
				453	k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
				454	}
				455	}
				456	packed_w += (nr - nr_block_size) * kr;
				457	}
				458
				459	for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
				460	const size_t kr_block_size = min(kc - kr_block_start, kr);
				461	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				462	for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
				463	*packed_w++ =
				464	k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
				465	}
				466	packed_w += kr - kr_block_size;
				467	}
				468	packed_w += (nr - nr_block_size) * kr;
				469	}
				470	}
				471	}
				472
				// Packs 16-bit GEMM weights stored as [g][nc][kc] (GOI) into packed_w, one
				// tile of up to nr output channels at a time. No bias slots are emitted.
				//
				// Each tile produces:
				//   - for every kr-wide step over the first skc = round_down_po2(kc, sr * kr)
				//     input channels: kr values per output channel n, read starting at
				//     round_down_po2(step, sr * kr) + ((step + n * kr) & ((sr - 1) * kr));
				//   - for the remaining input channels: up to kr values per output channel
				//     in source order, with the unused tail of each kr group skipped.
				// Groups belonging to channels missing from a partial tile are skipped too.
				// g must be >= 1: the group counter is tested after the loop body.
				void xnn_pack_f16_gemminc_goi_w(
				  size_t g,
				  size_t nc,
				  size_t kc,
				  size_t nr,
				  size_t kr,
				  size_t sr,
				  const uint16_t* k,
				  uint16_t* packed_w)
				{
				  const size_t skr = sr * kr;
				  const size_t skc = round_down_po2(kc, skr);
				  const size_t sr_mask = (sr - 1) * kr;
				  do {
				    for (size_t n0 = 0; n0 < nc; n0 += nr) {
				      const size_t tile_n = min(nc - n0, nr);

				      // Input channels covered by whole sr * kr windows.
				      for (size_t k0 = 0; k0 < skc; k0 += kr) {
				        const size_t window = round_down_po2(k0, skr);
				        for (size_t n = 0; n < tile_n; n++) {
				          const size_t src = (n0 + n) * kc + window + ((k0 + n * kr) & sr_mask);
				          for (size_t i = 0; i < kr; i++) {
				            packed_w[i] = k[src + i];
				          }
				          packed_w += kr;
				        }
				        packed_w += (nr - tile_n) * kr;
				      }

				      // Leftover input channels, copied in source order.
				      for (size_t k0 = skc; k0 < kc; k0 += kr) {
				        const size_t tile_k = min(kc - k0, kr);
				        for (size_t n = 0; n < tile_n; n++) {
				          const size_t src = (n0 + n) * kc + k0;
				          for (size_t i = 0; i < tile_k; i++) {
				            packed_w[i] = k[src + i];
				          }
				          // Always advance a full kr group, written or not.
				          packed_w += kr;
				        }
				        packed_w += (nr - tile_n) * kr;
				      }
				    }
				    k += nc * kc;
				  } while (--g != 0);
				}
				515
				516	void xnn_pack_f16_conv_goki_w(
				517	size_t g,
				518	size_t nc,
				519	size_t ks,
				520	size_t kc,
				521	size_t nr,
				522	size_t kr,
				523	size_t sr,
				524	const uint16_t* k,
				525	const uint16_t* b,
				526	uint16_t* packed_w)
				527	{
				528	const size_t skr = sr * kr;
				529	const size_t skc = round_down_po2(kc, skr);
				530	const size_t sr_mask = (sr - 1) * kr;
				531	do {
				532	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				533	const size_t nr_block_size = min(nc - nr_block_start, nr);
				534	if XNN_LIKELY(b != NULL) {
				535	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				536	packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
				537	}
				538	}
				539	packed_w += nr;
				540
				541	for (size_t ki = 0; ki < ks; ki++) {
				542	for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
				543	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				544	for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
				545	*packed_w++ =
				546	k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
				547	}
				548	}
				549	packed_w += (nr - nr_block_size) * kr;
				550	}
				551
				552	for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
				553	const size_t kr_block_size = min(kc - kr_block_start, kr);
				554	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				555	for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
				556	*packed_w++ =
				557	k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
				558	}
				559	packed_w += kr - kr_block_size;
				560	}
				561	packed_w += (nr - nr_block_size) * kr;
				562	}
				563	}
				564	}
				565	k += ks * kc * nc;
				566	if XNN_UNPREDICTABLE(b != NULL) {
				567	b += nc;
				568	}
				569	} while (--g != 0);
				570	}
				571
				572	void xnn_pack_f16_conv_kgo_w(
				573	size_t g,
				574	size_t nc,
				575	size_t ks,
				576	size_t nr,
				577	size_t kr,
				578	const uint16_t* k,
				579	const uint16_t* b,
				580	uint16_t* packed_w)
				581	{
				582	for (size_t i = 0; i < g; i++) {
				583	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				584	const size_t nr_block_size = min(nc - nr_block_start, nr);
				585	if XNN_LIKELY(b != NULL) {
				586	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				587	packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
				588	}
				589	}
				590	packed_w += nr;
				591	for (size_t ki = 0; ki < ks; ki++) {
				592	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				593	*packed_w =
				594	k[ki * g * nc + (nr_block_start + nr_block_offset)];
				595	packed_w += kr;
				596	}
				597	packed_w += (nr - nr_block_size) * kr;
				598	}
				599	}
				600	k += nc;
				601	if XNN_UNPREDICTABLE(b != NULL) {
				602	b += nc;
				603	}
				604	}
				605	}
				606
				607	void xnn_pack_f16_dconv_oki_w(
				608	size_t nc,
				609	size_t kc,
				610	size_t nr,
				611	size_t kh,
				612	size_t kw,
				613	const uint16_t* k,
				614	const uint16_t* b,
				615	uint16_t* packed_w)
				616	{
				617	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				618	const size_t nr_block_size = min(nc - nr_block_start, nr);
				619	if XNN_LIKELY(b != NULL) {
				620	for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
				621	*packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
				622	}
				623	} else {
				624	size_t n = nr;
				625	do {
				626	*packed_w++ = 0;
				627	} while (--n != 0);
				628	}
				629
				630	for (size_t kx = 0; kx < kw; kx++) {
				631	for (size_t c = 0; c < kc; c++) {
				632	for (size_t ky = 0; ky < kh; ky++) {
				633	for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
				634	packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) kh + ky) * kw + kx) * kc + c];
				635	}
				636	}
				637	}
				638	}
				639	if XNN_UNPREDICTABLE(b != NULL) {
				640	b += nr;
				641	}
				642	}
				643	}
				644
				645	void xnn_pack_f16_deconv_goki_w(
				646	size_t g,
				647	size_t nc,
				648	size_t kh,
				649	size_t kw,
				650	size_t kc,
				651	size_t sh,
				652	size_t sw,
				653	size_t nr,
				654	size_t kr,
				655	size_t sr,
				656	const uint16_t* k,
				657	const uint16_t* b,
				658	uint16_t* packed_w,
				659	struct subconvolution_params* params)
				660	{
				661	const size_t skr = sr * kr;
				662	const size_t skc = round_down_po2(kc, skr);
				663	const size_t sr_mask = (sr - 1) * kr;
				664	for (size_t i = 0; i < g; i++) {
				665	for (size_t oy = 0; oy < sh; oy++) {
				666	for (size_t ox = 0; ox < sw; ox++) {
				667	if (i == 0) {
				668	(*params++).weights = packed_w;
				669	}
				670	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				671	const size_t nr_block_size = min(nc - nr_block_start, nr);
				672	if XNN_LIKELY(b != NULL) {
				673	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				674	packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
				675	}
				676	}
				677	packed_w += nr;
				678	for (size_t ky = oy; ky < kh; ky += sh) {
				679	for (size_t kx = ox; kx < kw; kx += sw) {
				680	for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
				681	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				682	for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
				683	*packed_w++ =
				684	k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
				685	}
				686	}
				687	packed_w += (nr - nr_block_size) * kr;
				688	}
				689
				690	for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
				691	const size_t kr_block_size = min(kc - kr_block_start, kr);
				692	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				693	for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
				694	*packed_w++ =
				695	k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
				696	}
				697	packed_w += kr - kr_block_size;
				698	}
				699	packed_w += (nr - nr_block_size) * kr;
				700	}
				701	}
				702	}
				703	}
				704	}
				705	}
				706	k += kh * kw * kc * nc;
				707	if XNN_UNPREDICTABLE(b != NULL) {
				708	b += nc;
				709	}
				710	}
				711	}
				712
				713	void xnn_pack_f16_dwconv_ghw_w(
				714	size_t h,
				715	size_t w,
				716	size_t c,
				717	size_t cr,
				718	const uint16_t* k,
				719	const uint16_t* b,
				720	uint16_t* packed_w)
				721	{
				722	for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
				723	const size_t cr_block_size = min(c - cr_block_start, cr);
				724	if XNN_LIKELY(b != NULL) {
				725	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				726	*packed_w++ = b[cr_block_start + cr_block_offset];
				727	}
				728	} else {
				729	size_t n = cr_block_size;
				730	do {
				731	*packed_w++ = 0;
				732	} while (--n != 0);
				733	}
				734	packed_w += cr - cr_block_size;
				735	for (size_t x = 0; x < w; x++) {
				736	for (size_t y = 0; y < h; y++) {
				737	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				738	const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
				739	*packed_w++ = kv;
				740	}
				741	packed_w += cr - cr_block_size;
				742	}
				743	}
				744	}
				745	}
				746
				747	void xnn_pack_f16_dwconv_hwg_w(
				748	size_t h,
				749	size_t w,
				750	size_t c,
				751	size_t cr,
				752	const uint16_t* k,
				753	const uint16_t* b,
				754	uint16_t* packed_w)
				755	{
				756	for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
				757	const size_t cr_block_size = min(c - cr_block_start, cr);
				758	if XNN_LIKELY(b != NULL) {
				759	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				760	*packed_w++ = b[cr_block_start + cr_block_offset];
				761	}
				762	} else {
				763	size_t n = cr_block_size;
				764	do {
				765	*packed_w++ = 0;
				766	} while (--n != 0);
				767	}
				768	packed_w += cr - cr_block_size;
				769	for (size_t x = 0; x < w; x++) {
				770	for (size_t y = 0; y < h; y++) {
				771	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				772	const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
				773	*packed_w++ = kv;
				774	}
				775	packed_w += cr - cr_block_size;
				776	}
				777	}
				778	}
				779	}
				780
				781	void xnn_pack_f16_chw_dwconv_ghw_w(
				782	size_t kernel_size,
				783	size_t groups,
				784	const uint16_t* kernel,
				785	const uint16_t* bias,
				786	uint16_t* packed_weights)
				787	{
				788	for (size_t g = 0; g < groups; g++) {
				789	if XNN_LIKELY(bias != NULL) {
				790	packed_weights = bias++;
				791	} else {
				792	*packed_weights = 0;
				793	}
				794	packed_weights += 1;
				795	for (size_t i = 0; i < kernel_size; i++) {
				796	packed_weights++ = kernel[g kernel_size + i];
				797	}
				798	}
				799	}
				800
				801	void xnn_pack_f32_gemm_goi_w(
				802	size_t g,
				803	size_t nc,
				804	size_t kc,
				805	size_t nr,
				806	size_t kr,
				807	size_t sr,
				808	const float* k,
				809	const float* b,
				810	float* packed_w)
				811	{
				812	const size_t skr = sr * kr;
				813	const size_t skc = round_down_po2(kc, skr);
				814	const size_t sr_mask = (sr - 1) * kr;
				815	do {
				816	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				817	const size_t nr_block_size = min(nc - nr_block_start, nr);
				818	if XNN_LIKELY(b != NULL) {
				819	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				820	packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
				821	}
				822	}
				823	packed_w += nr;
				824
				825	for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
				826	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				827	for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
				828	*packed_w++ =
				829	k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
				830	}
				831	}
				832	packed_w += (nr - nr_block_size) * kr;
				833	}
				834
				835	for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
				836	const size_t kr_block_size = min(kc - kr_block_start, kr);
				837	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				838	for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
				839	*packed_w++ =
				840	k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
				841	}
				842	packed_w += kr - kr_block_size;
				843	}
				844	packed_w += (nr - nr_block_size) * kr;
				845	}
				846	}
				847	k += nc * kc;
				848	if XNN_UNPREDICTABLE(b != NULL) {
				849	b += nc;
				850	}
				851	} while (--g != 0);
				852	}
				853
				854	void xnn_pack_f32_gemm_io_w(
				855	size_t nc,
				856	size_t kc,
				857	size_t nr,
				858	size_t kr,
				859	size_t sr,
				860	const float* k,
				861	const float* b,
				862	float* packed_w)
				863	{
				864	const size_t skr = sr * kr;
				865	const size_t skc = round_down_po2(kc, skr);
				866	const size_t sr_mask = (sr - 1) * kr;
				867	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				868	const size_t nr_block_size = min(nc - nr_block_start, nr);
				869	if XNN_LIKELY(b != NULL) {
				870	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				871	packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
				872	}
				873	}
				874	packed_w += nr;
				875
				876	for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
				877	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				878	for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
				879	*packed_w++ =
				880	k[(round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
				881	}
				882	}
				883	packed_w += (nr - nr_block_size) * kr;
				884	}
				885
				886	for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
				887	const size_t kr_block_size = min(kc - kr_block_start, kr);
				888	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				889	for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
				890	*packed_w++ =
				891	k[(kr_block_start + kr_block_offset) * nc + (nr_block_start + nr_block_offset)];
				892	}
				893	packed_w += kr - kr_block_size;
				894	}
				895	packed_w += (nr - nr_block_size) * kr;
				896	}
				897	}
				898	}
				899
				900	void xnn_pack_f32_gemminc_goi_w(
				901	size_t g,
				902	size_t nc,
				903	size_t kc,
				904	size_t nr,
				905	size_t kr,
				906	size_t sr,
				907	const float* k,
				908	float* packed_w)
				909	{
				910	const size_t skr = sr * kr;
				911	const size_t skc = round_down_po2(kc, skr);
				912	const size_t sr_mask = (sr - 1) * kr;
				913	do {
				914	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				915	const size_t nr_block_size = min(nc - nr_block_start, nr);
				916
				917	for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
				918	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				919	for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
				920	*packed_w++ =
				921	k[(nr_block_start + nr_block_offset) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
				922	}
				923	}
				924	packed_w += (nr - nr_block_size) * kr;
				925	}
				926
				927	for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
				928	const size_t kr_block_size = min(kc - kr_block_start, kr);
				929	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				930	for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
				931	*packed_w++ =
				932	k[(nr_block_start + nr_block_offset) * kc + (kr_block_start + kr_block_offset)];
				933	}
				934	packed_w += kr - kr_block_size;
				935	}
				936	packed_w += (nr - nr_block_size) * kr;
				937	}
				938	}
				939	k += nc * kc;
				940	} while (--g != 0);
				941	}
				942
				943	void xnn_pack_f32_conv_goki_w(
				944	size_t g,
				945	size_t nc,
				946	size_t ks,
				947	size_t kc,
				948	size_t nr,
				949	size_t kr,
				950	size_t sr,
				951	const float* k,
				952	const float* b,
				953	float* packed_w)
				954	{
				955	const size_t skr = sr * kr;
				956	const size_t skc = round_down_po2(kc, skr);
				957	const size_t sr_mask = (sr - 1) * kr;
				958	do {
				959	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				960	const size_t nr_block_size = min(nc - nr_block_start, nr);
				961	if XNN_LIKELY(b != NULL) {
				962	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				963	packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
				964	}
				965	}
				966	packed_w += nr;
				967
				968	for (size_t ki = 0; ki < ks; ki++) {
				969	for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
				970	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				971	for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
				972	*packed_w++ =
				973	k[((nr_block_start + nr_block_offset) * ks + ki) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
				974	}
				975	}
				976	packed_w += (nr - nr_block_size) * kr;
				977	}
				978
				979	for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
				980	const size_t kr_block_size = min(kc - kr_block_start, kr);
				981	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				982	for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
				983	*packed_w++ =
				984	k[((nr_block_start + nr_block_offset) * ks + ki) * kc + (kr_block_start + kr_block_offset)];
				985	}
				986	packed_w += kr - kr_block_size;
				987	}
				988	packed_w += (nr - nr_block_size) * kr;
				989	}
				990	}
				991	}
				992	k += ks * kc * nc;
				993	if XNN_UNPREDICTABLE(b != NULL) {
				994	b += nc;
				995	}
				996	} while (--g != 0);
				997	}
				998
				999	void xnn_pack_f32_conv_kgo_w(
				1000	size_t g,
				1001	size_t nc,
				1002	size_t ks,
				1003	size_t nr,
				1004	size_t kr,
				1005	const float* k,
				1006	const float* b,
				1007	float* packed_w)
				1008	{
				1009	for (size_t i = 0; i < g; i++) {
				1010	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				1011	const size_t nr_block_size = min(nc - nr_block_start, nr);
				1012	if XNN_LIKELY(b != NULL) {
				1013	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				1014	packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
				1015	}
				1016	}
				1017	packed_w += nr;
				1018	for (size_t ki = 0; ki < ks; ki++) {
				1019	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				1020	*packed_w =
				1021	k[ki * g * nc + (nr_block_start + nr_block_offset)];
				1022	packed_w += kr;
				1023	}
				1024	packed_w += (nr - nr_block_size) * kr;
				1025	}
				1026	}
				1027	k += nc;
				1028	if XNN_UNPREDICTABLE(b != NULL) {
				1029	b += nc;
				1030	}
				1031	}
				1032	}
				1033
				1034	void xnn_pack_f32_dconv_oki_w(
				1035	size_t nc,
				1036	size_t kc,
				1037	size_t nr,
				1038	size_t kh,
				1039	size_t kw,
				1040	const float* k,
				1041	const float* b,
				1042	float* packed_w)
				1043	{
				1044	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				1045	const size_t nr_block_size = min(nc - nr_block_start, nr);
				1046	if XNN_LIKELY(b != NULL) {
				1047	for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
				1048	*packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
				1049	}
				1050	} else {
				1051	size_t n = nr;
				1052	do {
				1053	*packed_w++ = 0.0f;
				1054	} while (--n != 0);
				1055	}
				1056
				1057	for (size_t kx = 0; kx < kw; kx++) {
				1058	for (size_t c = 0; c < kc; c++) {
				1059	for (size_t ky = 0; ky < kh; ky++) {
				1060	for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
				1061	packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) kh + ky) * kw + kx) * kc + c];
				1062	}
				1063	}
				1064	}
				1065	}
				1066	if XNN_UNPREDICTABLE(b != NULL) {
				1067	b += nr;
				1068	}
				1069	}
				1070	}
				1071
				1072	void xnn_pack_f32_deconv_goki_w(
				1073	size_t g,
				1074	size_t nc,
				1075	size_t kh,
				1076	size_t kw,
				1077	size_t kc,
				1078	size_t sh,
				1079	size_t sw,
				1080	size_t nr,
				1081	size_t kr,
				1082	size_t sr,
				1083	const float* k,
				1084	const float* b,
				1085	float* packed_w,
				1086	struct subconvolution_params* params)
				1087	{
				1088	const size_t skr = sr * kr;
				1089	const size_t skc = round_down_po2(kc, skr);
				1090	const size_t sr_mask = (sr - 1) * kr;
				1091	for (size_t i = 0; i < g; i++) {
				1092	for (size_t oy = 0; oy < sh; oy++) {
				1093	for (size_t ox = 0; ox < sw; ox++) {
				1094	if (i == 0) {
				1095	(*params++).weights = packed_w;
				1096	}
				1097	for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
				1098	const size_t nr_block_size = min(nc - nr_block_start, nr);
				1099	if XNN_LIKELY(b != NULL) {
				1100	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				1101	packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
				1102	}
				1103	}
				1104	packed_w += nr;
				1105	for (size_t ky = oy; ky < kh; ky += sh) {
				1106	for (size_t kx = ox; kx < kw; kx += sw) {
				1107	for (size_t kr_block_start = 0; kr_block_start < skc; kr_block_start += kr) {
				1108	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				1109	for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
				1110	*packed_w++ =
				1111	k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + round_down_po2(kr_block_start, skr) + ((kr_block_start + nr_block_offset * kr) & sr_mask) + kr_block_offset];
				1112	}
				1113	}
				1114	packed_w += (nr - nr_block_size) * kr;
				1115	}
				1116
				1117	for (size_t kr_block_start = skc; kr_block_start < kc; kr_block_start += kr) {
				1118	const size_t kr_block_size = min(kc - kr_block_start, kr);
				1119	for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
				1120	for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size; kr_block_offset++) {
				1121	*packed_w++ =
				1122	k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + (kr_block_start + kr_block_offset)];
				1123	}
				1124	packed_w += kr - kr_block_size;
				1125	}
				1126	packed_w += (nr - nr_block_size) * kr;
				1127	}
				1128	}
				1129	}
				1130	}
				1131	}
				1132	}
				1133	k += kh * kw * kc * nc;
				1134	if XNN_UNPREDICTABLE(b != NULL) {
				1135	b += nc;
				1136	}
				1137	}
				1138	}
				1139
				1140	void xnn_pack_f32_dwconv_ghw_w(
				1141	size_t h,
				1142	size_t w,
				1143	size_t c,
				1144	size_t cr,
				1145	const float* k,
				1146	const float* b,
				1147	float* packed_w)
				1148	{
				1149	for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
				1150	const size_t cr_block_size = min(c - cr_block_start, cr);
				1151	if XNN_LIKELY(b != NULL) {
				1152	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				1153	*packed_w++ = b[cr_block_start + cr_block_offset];
				1154	}
				1155	} else {
				1156	size_t n = cr_block_size;
				1157	do {
				1158	*packed_w++ = 0.0f;
				1159	} while (--n != 0);
				1160	}
				1161	packed_w += cr - cr_block_size;
				1162	for (size_t x = 0; x < w; x++) {
				1163	for (size_t y = 0; y < h; y++) {
				1164	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				1165	const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
				1166	*packed_w++ = kv;
				1167	}
				1168	packed_w += cr - cr_block_size;
				1169	}
				1170	}
				1171	}
				1172	}
				1173
				1174	void xnn_pack_f32_dwconv_hwg_w(
				1175	size_t h,
				1176	size_t w,
				1177	size_t c,
				1178	size_t cr,
				1179	const float* k,
				1180	const float* b,
				1181	float* packed_w)
				1182	{
				1183	for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
				1184	const size_t cr_block_size = min(c - cr_block_start, cr);
				1185	if XNN_LIKELY(b != NULL) {
				1186	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				1187	*packed_w++ = b[cr_block_start + cr_block_offset];
				1188	}
				1189	} else {
				1190	size_t n = cr_block_size;
				1191	do {
				1192	*packed_w++ = 0.0f;
				1193	} while (--n != 0);
				1194	}
				1195	packed_w += cr - cr_block_size;
				1196	for (size_t x = 0; x < w; x++) {
				1197	for (size_t y = 0; y < h; y++) {
				1198	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				1199	const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
				1200	*packed_w++ = kv;
				1201	}
				1202	packed_w += cr - cr_block_size;
				1203	}
				1204	}
				1205	}
				1206	}
				1207
				1208	void xnn_pack_f32_chw_dwconv_ghw_w(
				1209	size_t kernel_size,
				1210	size_t groups,
				1211	const float* kernel,
				1212	const float* bias,
				1213	float* packed_weights)
				1214	{
				1215	for (size_t g = 0; g < groups; g++) {
				1216	if XNN_LIKELY(bias != NULL) {
				1217	packed_weights = bias++;
				1218	} else {
				1219	*packed_weights = 0.0f;
				1220	}
				1221	packed_weights += 1;
				1222	for (size_t i = 0; i < kernel_size; i++) {
				1223	packed_weights++ = kernel[g kernel_size + i];
				1224	}
				1225	}
				1226	}
				1227
				1228	void xnn_pack_f32_chw_dwconv_hwg_w(
				1229	size_t kernel_size,
				1230	size_t groups,
				1231	const float* kernel,
				1232	const float* bias,
				1233	float* packed_weights)
				1234	{
				1235	for (size_t g = 0; g < groups; g++) {
				1236	if XNN_LIKELY(bias != NULL) {
				1237	packed_weights = bias++;
				1238	} else {
				1239	*packed_weights = 0.0f;
				1240	}
				1241	packed_weights += 1;
				1242	for (size_t i = 0; i < kernel_size; i++) {
				1243	packed_weights++ = kernel[i groups + g];
				1244	}
				1245	}
				1246	}
				1247
				1248	void xnn_pack_f32_vmulcaddc_w(
				1249	size_t c,
				1250	size_t cr,
				1251	const float* s,
				1252	const float* b,
				1253	float* packed_w)
				1254	{
				1255	for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
				1256	const size_t cr_block_size = min(c - cr_block_start, cr);
				1257	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				1258	*packed_w++ = s[cr_block_start + cr_block_offset];
				1259	}
				1260	packed_w += cr - cr_block_size;
				1261	if XNN_LIKELY(b != NULL) {
				1262	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				1263	*packed_w++ = b[cr_block_start + cr_block_offset];
				1264	}
				1265	} else {
				1266	size_t n = cr_block_size;
				1267	do {
				1268	*packed_w++ = 0.0f;
				1269	} while (--n != 0);
				1270	}
				1271	packed_w += cr - cr_block_size;
				1272	}
				1273	}
				1274
				1275	void xnn_pack_f16_vmulcaddc_w(
				1276	size_t c,
				1277	size_t cr,
				1278	const uint16_t* s,
				1279	const uint16_t* b,
				1280	uint16_t* packed_w)
				1281	{
				1282	for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
				1283	const size_t cr_block_size = min(c - cr_block_start, cr);
				1284	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				1285	*packed_w++ = s[cr_block_start + cr_block_offset];
				1286	}
				1287	packed_w += cr - cr_block_size;
				1288	if XNN_LIKELY(b != NULL) {
				1289	for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
				1290	*packed_w++ = b[cr_block_start + cr_block_offset];
				1291	}
				1292	} else {
				1293	size_t n = cr_block_size;
				1294	do {
				1295	*packed_w++ = 0;
				1296	} while (--n != 0);
				1297	}
				1298	packed_w += cr - cr_block_size;
				1299	}
				1300	}