// Auto-generated file. Do not edit!
//   Template: src/qs8-dwconv/unipass-neon-mul16.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/dwconv.h>

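// QS8 depthwise convolution microkernel: unipass ("up"), channel tile of 8,
// 9 kernel taps (typically a 3x3 window), NEON 16-bit multiply-accumulate
// ("mul16"), with fp32 requantization of the int32 accumulators.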
void xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16(
    size_t channels,
    size_t output_width,
    const int8_t** input,
    const void* weights,
    int8_t* output,
    size_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const int8_t* zero,
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
{
  assert(channels != 0);
  assert(output_width != 0);

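  // Requantization parameters: the fp32 scale, the output clamps pre-biased
  // by the output zero point, and the "magic bias" constants used to round
  // float -> int through the float bit pattern (see the store path below).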
  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
  const float32x4_t voutput_min_less_zero_point = vld1q_dup_f32(&params->fp32_neon.output_min_less_zero_point);
  const float32x4_t voutput_max_less_zero_point = vld1q_dup_f32(&params->fp32_neon.output_max_less_zero_point);
  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
  const int32x4_t vmagic_bias_less_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_zero_point);
  do {
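    // Set up the 9 input row pointers for this output pixel. Rows that point
    // at the shared `zero` buffer are padding and must not be offset.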
    const int8_t* i0 = input[0];
    assert(i0 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
    }
    const int8_t* i1 = input[1];
    assert(i1 != NULL);
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
    }
    const int8_t* i2 = input[2];
    assert(i2 != NULL);
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
    }
    const int8_t* i3 = input[3];
    assert(i3 != NULL);
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
    }
    const int8_t* i4 = input[4];
    assert(i4 != NULL);
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
    }
    const int8_t* i5 = input[5];
    assert(i5 != NULL);
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
    }
    const int8_t* i6 = input[6];
    assert(i6 != NULL);
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
    }
    const int8_t* i7 = input[7];
    assert(i7 != NULL);
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
    }
    const int8_t* i8 = input[8];
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
    }
    input = (const int8_t**) ((uintptr_t) input + input_stride);

    size_t c = channels;
    const void* w = weights;
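    // Main loop: 8 channels per iteration. The packed weights interleave,
    // per 8-channel group, 8 int32 bias values followed by 9 taps x 8 int8
    // kernel values, matching the loads below.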
    for (; c >= 8; c -= 8) {
      int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
      int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));

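      // For each of the 9 taps: widen 8 int8 inputs and 8 int8 kernel values
      // to int16, then multiply-accumulate into the two int32 accumulator
      // halves (channels 0-3 and 4-7).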
      const int16x8_t vi0x01234567 = vmovl_s8(vld1_s8(i0)); i0 += 8;
      const int16x8_t vk0x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));

      vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567));
      vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi0x01234567), vget_high_s16(vk0x01234567));

      const int16x8_t vi1x01234567 = vmovl_s8(vld1_s8(i1)); i1 += 8;
      const int16x8_t vk1x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));

      vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567));
      vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi1x01234567), vget_high_s16(vk1x01234567));

      const int16x8_t vi2x01234567 = vmovl_s8(vld1_s8(i2)); i2 += 8;
      const int16x8_t vk2x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));

      vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567));
      vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi2x01234567), vget_high_s16(vk2x01234567));

      const int16x8_t vi3x01234567 = vmovl_s8(vld1_s8(i3)); i3 += 8;
      const int16x8_t vk3x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));

      vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi3x01234567), vget_low_s16(vk3x01234567));
      vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi3x01234567), vget_high_s16(vk3x01234567));

      const int16x8_t vi4x01234567 = vmovl_s8(vld1_s8(i4)); i4 += 8;
      const int16x8_t vk4x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));

      vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));
      vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));

      const int16x8_t vi5x01234567 = vmovl_s8(vld1_s8(i5)); i5 += 8;
      const int16x8_t vk5x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));

      vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi5x01234567), vget_low_s16(vk5x01234567));
      vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi5x01234567), vget_high_s16(vk5x01234567));

      const int16x8_t vi6x01234567 = vmovl_s8(vld1_s8(i6)); i6 += 8;
      const int16x8_t vk6x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));

      vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi6x01234567), vget_low_s16(vk6x01234567));
      vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi6x01234567), vget_high_s16(vk6x01234567));

      const int16x8_t vi7x01234567 = vmovl_s8(vld1_s8(i7)); i7 += 8;
      const int16x8_t vk7x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));

      vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi7x01234567), vget_low_s16(vk7x01234567));
      vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi7x01234567), vget_high_s16(vk7x01234567));

      const int16x8_t vi8x01234567 = vmovl_s8(vld1_s8(i8)); i8 += 8;
      const int16x8_t vk8x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));

      vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567));
      vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567));

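      // Requantize: convert to float, scale, clamp to the output range, then
      // add the magic bias so that the rounded integer appears in the low
      // bits of the float representation; subtracting
      // magic_bias_less_zero_point yields the quantized value with the
      // output zero point applied.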
      float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
      float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);

      vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
      vfpacc4567 = vmulq_f32(vfpacc4567, vscale);

      vfpacc0123 = vmaxq_f32(vfpacc0123, voutput_min_less_zero_point);
      vfpacc4567 = vmaxq_f32(vfpacc4567, voutput_min_less_zero_point);

      vfpacc0123 = vminq_f32(vfpacc0123, voutput_max_less_zero_point);
      vfpacc4567 = vminq_f32(vfpacc4567, voutput_max_less_zero_point);

      vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
      vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));

      vacc0123 = vsubq_s32(vacc0123, vmagic_bias_less_zero_point);
      vacc4567 = vsubq_s32(vacc4567, vmagic_bias_less_zero_point);

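      // Narrow int32 -> int16 -> int8. On AArch64, VMOVN_HIGH narrows the
      // second half directly into the upper lanes; on AArch32 the halves are
      // narrowed separately and combined.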
#if XNN_ARCH_ARM64
      const int16x8_t vacc01234567 = vmovn_high_s32(vmovn_s32(vacc0123), vacc4567);

      int8x8_t vout01234567 = vmovn_s16(vacc01234567);
#else
      const int16x8_t vacc01234567 = vcombine_s16(vmovn_s32(vacc0123), vmovn_s32(vacc4567));

      int8x8_t vout01234567 = vmovn_s16(vacc01234567);
#endif

      vst1_s8(output, vout01234567); output += 8;
    }
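    // Remainder: handle the final 1-7 channels. The loads still read full
    // 8-lane groups (the packed weights cover a whole channel tile), but
    // only `c` output bytes are stored.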
    if XNN_UNLIKELY(c != 0) {
      {
        int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
        int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));

        const int16x8_t vi0x01234567 = vmovl_s8(vld1_s8(i0));
        const int16x8_t vk0x01234567 = vmovl_s8(vld1_s8(w));

        vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567));
        vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi0x01234567), vget_high_s16(vk0x01234567));
        const int16x8_t vi1x01234567 = vmovl_s8(vld1_s8(i1));
        const int16x8_t vk1x01234567 = vmovl_s8(vld1_s8((const void*) ((uintptr_t) w + 8 * sizeof(int8_t))));

        vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567));
        vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi1x01234567), vget_high_s16(vk1x01234567));
        const int16x8_t vi2x01234567 = vmovl_s8(vld1_s8(i2));
        const int16x8_t vk2x01234567 = vmovl_s8(vld1_s8((const void*) ((uintptr_t) w + 16 * sizeof(int8_t))));

        vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567));
        vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi2x01234567), vget_high_s16(vk2x01234567));
        const int16x8_t vi3x01234567 = vmovl_s8(vld1_s8(i3));
        const int16x8_t vk3x01234567 = vmovl_s8(vld1_s8((const void*) ((uintptr_t) w + 24 * sizeof(int8_t))));

        vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi3x01234567), vget_low_s16(vk3x01234567));
        vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi3x01234567), vget_high_s16(vk3x01234567));
        const int16x8_t vi4x01234567 = vmovl_s8(vld1_s8(i4));
        const int16x8_t vk4x01234567 = vmovl_s8(vld1_s8((const void*) ((uintptr_t) w + 32 * sizeof(int8_t))));

        vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));
        vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));
        const int16x8_t vi5x01234567 = vmovl_s8(vld1_s8(i5));
        const int16x8_t vk5x01234567 = vmovl_s8(vld1_s8((const void*) ((uintptr_t) w + 40 * sizeof(int8_t))));

        vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi5x01234567), vget_low_s16(vk5x01234567));
        vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi5x01234567), vget_high_s16(vk5x01234567));
        const int16x8_t vi6x01234567 = vmovl_s8(vld1_s8(i6));
        const int16x8_t vk6x01234567 = vmovl_s8(vld1_s8((const void*) ((uintptr_t) w + 48 * sizeof(int8_t))));

        vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi6x01234567), vget_low_s16(vk6x01234567));
        vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi6x01234567), vget_high_s16(vk6x01234567));
        const int16x8_t vi7x01234567 = vmovl_s8(vld1_s8(i7));
        const int16x8_t vk7x01234567 = vmovl_s8(vld1_s8((const void*) ((uintptr_t) w + 56 * sizeof(int8_t))));

        vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi7x01234567), vget_low_s16(vk7x01234567));
        vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi7x01234567), vget_high_s16(vk7x01234567));
        const int16x8_t vi8x01234567 = vmovl_s8(vld1_s8(i8));
        const int16x8_t vk8x01234567 = vmovl_s8(vld1_s8((const void*) ((uintptr_t) w + 64 * sizeof(int8_t))));

        vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567));
        vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567));

        float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
        float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);

        vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
        vfpacc4567 = vmulq_f32(vfpacc4567, vscale);

        vfpacc0123 = vmaxq_f32(vfpacc0123, voutput_min_less_zero_point);
        vfpacc4567 = vmaxq_f32(vfpacc4567, voutput_min_less_zero_point);

        vfpacc0123 = vminq_f32(vfpacc0123, voutput_max_less_zero_point);
        vfpacc4567 = vminq_f32(vfpacc4567, voutput_max_less_zero_point);

        vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
        vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));

        vacc0123 = vsubq_s32(vacc0123, vmagic_bias_less_zero_point);
        vacc4567 = vsubq_s32(vacc4567, vmagic_bias_less_zero_point);

#if XNN_ARCH_ARM64
        const int16x8_t vacc01234567 = vmovn_high_s32(vmovn_s32(vacc0123), vacc4567);
        int8x8_t vout01234567 = vmovn_s16(vacc01234567);
#else
        const int16x8_t vacc01234567 = vcombine_s16(vmovn_s32(vacc0123), vmovn_s32(vacc4567));
        int8x8_t vout01234567 = vmovn_s16(vacc01234567);
#endif

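        // Store the final `c` (1-7) bytes with 4-, 2- and 1-lane stores,
        // rotating the vector with VEXT after each partial store.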
        if (c & 4) {
          vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_s8(vout01234567), 0); output += 4;
          vout01234567 = vext_s8(vout01234567, vout01234567, 4);
        }
        if (c & 2) {
          vst1_lane_u16(__builtin_assume_aligned(output, 1), vreinterpret_u16_s8(vout01234567), 0); output += 2;
          vout01234567 = vext_s8(vout01234567, vout01234567, 2);
        }
        if (c & 1) {
          vst1_lane_s8(output, vout01234567, 0); output += 1;
        }
      }
    }

    output = (int8_t*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}
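
// A minimal sketch (illustrative only, not part of the generated kernel and
// not XNNPACK API) of how a caller could size the packed weight buffer this
// ukernel reads. The layout follows the loads above: for each group of 8
// channels, 8 int32 bias values are followed by 9 taps x 8 int8 kernel
// values. The helper name and the assumption that channels are rounded up
// to whole groups of 8 are hypothetical.
#include <stddef.h>
#include <stdint.h>

static size_t example_up8x9_packed_weights_size(size_t channels) {
  const size_t groups = (channels + 7) / 8;           // round channels up to the tile of 8
  const size_t group_bytes = 8 * sizeof(int32_t)      // per-channel int32 bias
                           + 9 * 8 * sizeof(int8_t);  // 9 taps x 8 int8 kernel values
  return groups * group_bytes;
}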