Blame - src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c - platform/external/XNNPACK

blob: 941c6135631cdf0635b2aa817fea646cc5ce22b5 [file] [log] [blame]

Marat Dukhan	8575504	2022-01-13 01:46:05 -0800	[diff] [blame]	1	// Auto-generated file. Do not edit!
				2	// Template: src/qs8-gavgpool/multipass-neon.c.in
				3	// Generator: tools/xngen
				4	//
				5	// Copyright 2020 Google LLC
				6	//
				7	// This source code is licensed under the BSD-style license found in the
				8	// LICENSE file in the root directory of this source tree.
				9
				10	#include <assert.h>
				11
				12	#include <arm_neon.h>
				13
				14	#include <xnnpack/gavgpool.h>
				15	#include <xnnpack/math.h>
				16
				17
				18	void xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16(
				19	size_t rows,
				20	size_t channels,
				21	const uint8_t* input,
				22	size_t input_stride,
				23	const uint8_t* zero,
				24	int32_t* buffer,
				25	uint8_t* output,
				26	const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
				27	{
				28	assert(rows > 7);
				29	assert(channels != 0);
				30
				31	const uint8_t* i0 = input;
				32	const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
				33	const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
				34	const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
				35	const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
				36	const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
				37	const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
Marat Dukhan	c7c92b0	2022-01-18 18:53:05 -0800	[diff] [blame]	38	const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t);
Marat Dukhan	8575504	2022-01-13 01:46:05 -0800	[diff] [blame]	39
				40	const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
				41	int32_t* b = buffer;
				42	size_t c = channels;
				43	for (; c != 0; c = doz(c, 16)) {
				44	const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
				45	const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
				46	const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
				47	const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
				48
				49	const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
				50	uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
				51	const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
				52	uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
				53
				54	const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
				55	vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
				56	const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
				57	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
				58	const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
				59	vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
				60	const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
				61	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
				62	const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
				63	vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
				64	const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
				65	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
				66	const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
				67	vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
				68	const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
				69	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
				70	vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
				71	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
				72
				73	const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
				74	const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
				75	const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
				76	const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
				77
				78	vst1q_s32(b, vacc0123); b += 4;
				79	vst1q_s32(b, vacc4567); b += 4;
				80	vst1q_s32(b, vacc89AB); b += 4;
				81	vst1q_s32(b, vaccCDEF); b += 4;
				82	}
				83
				84	for (rows -= 7; rows > 7; rows -= 7) {
				85	i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
				86	i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
				87	i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
				88	i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
				89	i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
				90	i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
				91	i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
				92
				93	int32_t* b = buffer;
				94	size_t c = channels;
				95	for (; c != 0; c = doz(c, 16)) {
				96	const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
				97	const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
				98	const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
				99	const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
				100
				101	const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
				102	uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
				103	const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
				104	uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
				105
				106	const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
				107	vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
				108	const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
				109	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
				110	const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
				111	vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
				112	const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
				113	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
				114	const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
				115	vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
				116	const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
				117	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
				118	const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
				119	vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
				120	const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
				121	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
				122	int32x4_t vacc0123 = vld1q_s32(b);
				123	int32x4_t vacc4567 = vld1q_s32(b + 4);
				124	vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
				125	int32x4_t vacc89AB = vld1q_s32(b + 8);
				126	int32x4_t vaccCDEF = vld1q_s32(b + 12);
				127	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
				128
				129	vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
				130	vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
				131	vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
				132	vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
				133
				134	vst1q_s32(b, vacc0123); b += 4;
				135	vst1q_s32(b, vacc4567); b += 4;
				136	vst1q_s32(b, vacc89AB); b += 4;
				137	vst1q_s32(b, vaccCDEF); b += 4;
				138	}
				139	}
				140
				141	i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
				142	i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
				143	if XNN_UNPREDICTABLE(rows < 2) {
				144	i1 = zero;
				145	}
				146	i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
				147	if XNN_UNPREDICTABLE(rows <= 2) {
				148	i2 = zero;
				149	}
				150	i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
				151	if XNN_UNPREDICTABLE(rows < 4) {
				152	i3 = zero;
				153	}
				154	i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
				155	if XNN_UNPREDICTABLE(rows <= 4) {
				156	i4 = zero;
				157	}
				158	i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
				159	if XNN_UNPREDICTABLE(rows < 6) {
				160	i5 = zero;
				161	}
				162	i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
				163	if XNN_UNPREDICTABLE(rows <= 6) {
				164	i6 = zero;
				165	}
				166
				167	const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift);
				168	const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
				169	const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift);
				170	const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
				171	const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min);
				172	const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max);
				173	for (; channels >= 16; channels -= 16) {
				174	const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
				175	const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
				176	const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
				177	const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
				178
				179	const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
				180	uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
				181	const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
				182	uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
				183
				184	const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
				185	vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
				186	const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
				187	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
				188	const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
				189	vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
				190	const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
				191	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
				192	const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
				193	vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
				194	const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
				195	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
				196	const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
				197	vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
				198	const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
				199	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
				200	int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
				201	int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
				202	vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
				203	int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
				204	int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
				205	vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
				206
				207	vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
				208	vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
				209	vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
				210	vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
				211
				212	vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
				213	vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
				214	vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift);
				215	vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift);
				216
				217	vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
				218	vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
				219	vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier);
				220	vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier);
				221
				222	vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
				223	vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);
				224	vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift);
				225	vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift);
				226
				227	#if XNN_ARCH_ARM64
				228	int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
				229	int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
				230	#else // !XNN_ARCH_ARM64
				231	int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
				232	int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
				233	#endif // !XNN_ARCH_ARM64
				234
				235	vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
				236	vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
				237
				238	#if XNN_ARCH_ARM64
				239	uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
				240	#else // !XNN_ARCH_ARM64
				241	uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
				242	#endif // !XNN_ARCH_ARM64
				243
				244	vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
				245
				246	vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
				247
				248	vst1q_u8(output, vout0123456789ABCDEF); output += 16;
				249	}
				250	if XNN_UNLIKELY(channels != 0) {
				251	do {
				252	const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
				253	const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
				254	const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
				255	uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
				256
				257	const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
				258	vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
				259	const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
				260	vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
				261	const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
				262	vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
				263	const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
				264	vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
				265	int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
				266	int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
				267	vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
				268
				269	vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
				270	vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
				271
				272	vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
				273	vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
				274
				275	vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
				276	vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
				277
				278	vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
				279	vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);
				280
				281	#if XNN_ARCH_ARM64
				282	int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
				283	#else
				284	int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
				285	#endif
				286	vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
				287
				288	uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
				289	vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
				290	vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
				291
				292	if XNN_LIKELY(channels >= 8) {
				293	vst1_u8(output, vout01234567); output += 8;
				294	channels -= 8;
				295	} else {
				296	if (channels & 4) {
				297	vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
				298	vout01234567 = vext_u8(vout01234567, vout01234567, 4);
				299	}
				300	if (channels & 2) {
				301	vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
				302	vout01234567 = vext_u8(vout01234567, vout01234567, 2);
				303	}
				304	if (channels & 1) {
				305	vst1_lane_u8(output, vout01234567, 0); output += 1;
				306	}
				307	channels = 0;
				308	}
				309	} while (channels != 0);
				310	}
				311	}