// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>

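// Depthwise convolution ("dwconv") microkernel for a 5x5 filter with stride 2
// and horizontal padding 2 (the "5x5s2p2" in the name), implemented with
// AArch64 NEON FMA intrinsics. In the spchw layout the kernel processes one
// channel's 2D plane at a time: judging by the loop structure, m counts output
// rows, n is the input row width in pixels, and each output row is computed
// from five consecutive input rows with no vertical padding.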
void xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma(
    size_t m,
    size_t n,
    const float* input,
    const float* weights,
    float* output,
    size_t input_tuple_stride,
    size_t output_tuple_stride,
    size_t input_width_stride,
    size_t output_width_stride,
    const union xnn_f32_spchw_params params[restrict static 1])
{
  assert(n != 0);

  const uint32x4_t vmask_even = vld1q_u32(params->neon.mask_even);
  const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  const float32x4_t vmax = vld1q_dup_f32(&params->neon.max);
  const float32x4_t vmin = vld1q_dup_f32(&params->neon.min);

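  // Per-output-row pointer adjustments: the input pointers advance two rows
  // (vertical stride 2) minus the ceil(n/4) tuples already consumed while
  // walking the row; the output pointer advances one row minus the full
  // 4-output tuples written.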
  const size_t input_width_increment_single = input_width_stride * 2 - input_tuple_stride * ((n - 1) / 4 + 1);
  const size_t output_width_increment_single = output_width_stride - (n + 1) / 8 * output_tuple_stride;

  // No vertical padding.
  const float* i0 = input;
  const float* i1 = (const float*) ((uintptr_t) i0 + input_width_stride);
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width_stride);
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width_stride);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width_stride);

  float* output0 = output;

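  // The 26 weights appear to be the bias followed by the 25 filter taps in
  // row-major order (inferred from lane 0 of vw0123 seeding each accumulator
  // below).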
  const float32x4_t vw0123 = vld1q_f32(weights);
  const float32x4_t vw4567 = vld1q_f32(weights + 4);
  const float32x4_t vw89AB = vld1q_f32(weights + 8);
  const float32x4_t vwCDEF = vld1q_f32(weights + 12);
  const float32x4_t vwGHIJ = vld1q_f32(weights + 16);
  const float32x4_t vwKLMN = vld1q_f32(weights + 20);
  const float32x2_t vwOP = vld1_f32(weights + 24);

  do {
    float32x4_t vi0x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi1x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi2x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi3x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi4x0123 = vmovq_n_f32(0.0f);
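    // The zero vectors above supply the two columns of left padding ("p2"):
    // the first real input pixel is treated as column 4, so only lanes 2 and 3
    // of the 0123 block are ever read.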
    float32x4_t vi0x4567 = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
    float32x4_t vi1x4567 = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
    float32x4_t vi2x4567 = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
    float32x4_t vi3x4567 = vld1q_f32(i3); i3 = (const float*) ((uintptr_t) i3 + input_tuple_stride);
    float32x4_t vi4x4567 = vld1q_f32(i4); i4 = (const float*) ((uintptr_t) i4 + input_tuple_stride);

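    // Main loop: each iteration consumes 8 input columns and produces 4 output
    // pixels; the final group of up to 8 columns is handled by the masked
    // remainder block after the loop.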
    size_t k = n;
    for (; k > 8; k -= 8) {
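      // Two accumulators (summed at the end) shorten the FMA dependency chain;
      // vo468Ap00 starts from the bias in lane 0 of vw0123.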
      float32x4_t vo468Ap00 = vdupq_laneq_f32(vw0123, 0);

      float32x4_t vi0x89AB;
      float32x4_t vi1x89AB;
      float32x4_t vi2x89AB;
      float32x4_t vi3x89AB;
      float32x4_t vi4x89AB;

      vi0x89AB = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
      vi1x89AB = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
      vi2x89AB = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
      vi3x89AB = vld1q_f32(i3); i3 = (const float*) ((uintptr_t) i3 + input_tuple_stride);
      vi4x89AB = vld1q_f32(i4); i4 = (const float*) ((uintptr_t) i4 + input_tuple_stride);

      float32x4_t vi0xCDEF;
      float32x4_t vi1xCDEF;
      float32x4_t vi2xCDEF;
      float32x4_t vi3xCDEF;
      float32x4_t vi4xCDEF;

      vi0xCDEF = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
      vi1xCDEF = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
      vi2xCDEF = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
      vi3xCDEF = vld1q_f32(i3); i3 = (const float*) ((uintptr_t) i3 + input_tuple_stride);
      vi4xCDEF = vld1q_f32(i4); i4 = (const float*) ((uintptr_t) i4 + input_tuple_stride);

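      // De-interleave columns into even (468A) and odd (579B) lanes; with
      // horizontal stride 2, the even columns are the ones aligned with output
      // pixels.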
      float32x4_t vi0x468A = vuzp1q_f32(vi0x4567, vi0x89AB);
      float32x4_t vi0x579B = vuzp2q_f32(vi0x4567, vi0x89AB);
      float32x4_t vi1x468A = vuzp1q_f32(vi1x4567, vi1x89AB);
      float32x4_t vi1x579B = vuzp2q_f32(vi1x4567, vi1x89AB);
      float32x4_t vi2x468A = vuzp1q_f32(vi2x4567, vi2x89AB);
      float32x4_t vi2x579B = vuzp2q_f32(vi2x4567, vi2x89AB);
      float32x4_t vi3x468A = vuzp1q_f32(vi3x4567, vi3x89AB);
      float32x4_t vi3x579B = vuzp2q_f32(vi3x4567, vi3x89AB);
      float32x4_t vi4x468A = vuzp1q_f32(vi4x4567, vi4x89AB);
      float32x4_t vi4x579B = vuzp2q_f32(vi4x4567, vi4x89AB);

      // middle tap
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x468A, vw0123, 3);
      float32x4_t vo468Ap01 = vmulq_laneq_f32(vi1x468A, vw89AB, 0);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x468A, vwCDEF, 1);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x468A, vwGHIJ, 2);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x468A, vwKLMN, 3);

      // one left
      const float32x4_t vi0x3579 = vextq_f32(vi0x0123, vi0x579B, 3);
      const float32x4_t vi1x3579 = vextq_f32(vi1x0123, vi1x579B, 3);
      const float32x4_t vi2x3579 = vextq_f32(vi2x0123, vi2x579B, 3);
      const float32x4_t vi3x3579 = vextq_f32(vi3x0123, vi3x579B, 3);
      const float32x4_t vi4x3579 = vextq_f32(vi4x0123, vi4x579B, 3);

      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x3579, vw0123, 2);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x3579, vw4567, 3);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x3579, vwCDEF, 0);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x3579, vwGHIJ, 1);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x3579, vwKLMN, 2);

      // two left
      // Getting the vector for the far-left tap is awkward: no ext of the
      // vectors we already hold produces it. Instead, reinterpret the
      // float32x4 vector as uint64x2, shift left by 32 bits, and reinterpret
      // back as float32x4; the rightmost lanes then hold exactly what the
      // following ext needs.
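      // For example, with lanes written low to high: f32 {v0,v1,v2,v3} viewed
      // as u64 is {v1:v0, v3:v2}; shifting left by 32 gives {v0:0, v2:0},
      // i.e. f32 {0,v0,0,v2}, and only lane 3 (v2) is consumed by the ext.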
      const float32x4_t vi0x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi0x0123), 32));
      const float32x4_t vi1x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi1x0123), 32));
      const float32x4_t vi2x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi2x0123), 32));
      const float32x4_t vi3x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi3x0123), 32));
      const float32x4_t vi4x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi4x0123), 32));

      const float32x4_t vi0x2468 = vextq_f32(vi0x0012, vi0x468A, 3);
      const float32x4_t vi1x2468 = vextq_f32(vi1x0012, vi1x468A, 3);
      const float32x4_t vi2x2468 = vextq_f32(vi2x0012, vi2x468A, 3);
      const float32x4_t vi3x2468 = vextq_f32(vi3x0012, vi3x468A, 3);
      const float32x4_t vi4x2468 = vextq_f32(vi4x0012, vi4x468A, 3);

      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x2468, vw0123, 1);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x2468, vw4567, 2);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x2468, vw89AB, 3);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x2468, vwGHIJ, 0);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x2468, vwKLMN, 1);

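      // Slide the column window: this iteration's 89AB block becomes the next
      // iteration's 0123 block (and CDEF below becomes 4567).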
      vi0x0123 = vi0x89AB;
      vi1x0123 = vi1x89AB;
      vi2x0123 = vi2x89AB;
      vi3x0123 = vi3x89AB;
      vi4x0123 = vi4x89AB;

      // one right
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x579B, vw4567, 0);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x579B, vw89AB, 1);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x579B, vwCDEF, 2);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x579B, vwGHIJ, 3);
      vo468Ap00 = vfmaq_lane_f32(vo468Ap00, vi4x579B, vwOP, 0);

      // two right
      const float32x4_t vi0x68AC = vextq_f32(vi0x468A, vi0xCDEF, 1);
      const float32x4_t vi1x68AC = vextq_f32(vi1x468A, vi1xCDEF, 1);
      const float32x4_t vi2x68AC = vextq_f32(vi2x468A, vi2xCDEF, 1);
      const float32x4_t vi3x68AC = vextq_f32(vi3x468A, vi3xCDEF, 1);
      const float32x4_t vi4x68AC = vextq_f32(vi4x468A, vi4xCDEF, 1);

      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x68AC, vw4567, 1);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x68AC, vw89AB, 2);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x68AC, vwCDEF, 3);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x68AC, vwKLMN, 0);
      vo468Ap00 = vfmaq_lane_f32(vo468Ap00, vi4x68AC, vwOP, 1);

      vi0x4567 = vi0xCDEF;
      vi1x4567 = vi1xCDEF;
      vi2x4567 = vi2xCDEF;
      vi3x4567 = vi3xCDEF;
      vi4x4567 = vi4xCDEF;

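      // Sum the two partial accumulators and clamp to the output range.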
      float32x4_t vo0 = vaddq_f32(vo468Ap00, vo468Ap01);

      vo0 = vmaxq_f32(vo0, vmin);
      vo0 = vminq_f32(vo0, vmax);

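      // k > 8 inside this loop, so k_tmp >= 5 and the full 4-output store is
      // always taken here; the partial path only matters in the remainder
      // block below, which reuses this store sequence.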
      size_t k_tmp = (k + 1) / 2;
      if XNN_LIKELY(k_tmp >= 4) {
        vst1q_f32(output0, vo0);
        output0 = (float*) ((uintptr_t) output0 + output_tuple_stride);
      } else {
        float* output0_lo = output0;
        float32x2_t vo0_lo = vget_low_f32(vo0);
        if (k_tmp & 2) {
          vst1_f32(output0_lo, vo0_lo); output0_lo += 2;
          vo0_lo = vget_high_f32(vo0);
        }
        if (k_tmp & 1) {
          vst1_lane_f32(output0_lo, vo0_lo, 0);
        }
      }
    }

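    // Remainder: the last 1-8 input columns, processed with the same structure
    // as the main loop, but with out-of-bounds loads replaced by zeros and the
    // even/odd column vectors masked so padding lanes contribute nothing.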
    {
      float32x4_t vo468Ap00 = vdupq_laneq_f32(vw0123, 0);

      float32x4_t vi0x89AB;
      float32x4_t vi1x89AB;
      float32x4_t vi2x89AB;
      float32x4_t vi3x89AB;
      float32x4_t vi4x89AB;

      if XNN_LIKELY(k > 4) {
        vi0x89AB = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
        vi1x89AB = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
        vi2x89AB = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
        vi3x89AB = vld1q_f32(i3); i3 = (const float*) ((uintptr_t) i3 + input_tuple_stride);
        vi4x89AB = vld1q_f32(i4); i4 = (const float*) ((uintptr_t) i4 + input_tuple_stride);
      } else {
        vi0x89AB = vmovq_n_f32(0.f);
        vi1x89AB = vmovq_n_f32(0.f);
        vi2x89AB = vmovq_n_f32(0.f);
        vi3x89AB = vmovq_n_f32(0.f);
        vi4x89AB = vmovq_n_f32(0.f);
      }

      float32x4_t vi0xCDEF;
      float32x4_t vi1xCDEF;
      float32x4_t vi2xCDEF;
      float32x4_t vi3xCDEF;
      float32x4_t vi4xCDEF;

      if XNN_LIKELY(k > 8) {
        vi0xCDEF = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
        vi1xCDEF = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
        vi2xCDEF = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
        vi3xCDEF = vld1q_f32(i3); i3 = (const float*) ((uintptr_t) i3 + input_tuple_stride);
        vi4xCDEF = vld1q_f32(i4); i4 = (const float*) ((uintptr_t) i4 + input_tuple_stride);
      } else {
        vi0xCDEF = vmovq_n_f32(0.f);
        vi1xCDEF = vmovq_n_f32(0.f);
        vi2xCDEF = vmovq_n_f32(0.f);
        vi3xCDEF = vmovq_n_f32(0.f);
        vi4xCDEF = vmovq_n_f32(0.f);
      }
      float32x4_t vi0x468A = vuzp1q_f32(vi0x4567, vi0x89AB);
      float32x4_t vi0x579B = vuzp2q_f32(vi0x4567, vi0x89AB);
      float32x4_t vi1x468A = vuzp1q_f32(vi1x4567, vi1x89AB);
      float32x4_t vi1x579B = vuzp2q_f32(vi1x4567, vi1x89AB);
      float32x4_t vi2x468A = vuzp1q_f32(vi2x4567, vi2x89AB);
      float32x4_t vi2x579B = vuzp2q_f32(vi2x4567, vi2x89AB);
      float32x4_t vi3x468A = vuzp1q_f32(vi3x4567, vi3x89AB);
      float32x4_t vi3x579B = vuzp2q_f32(vi3x4567, vi3x89AB);
      float32x4_t vi4x468A = vuzp1q_f32(vi4x4567, vi4x89AB);
      float32x4_t vi4x579B = vuzp2q_f32(vi4x4567, vi4x89AB);

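      // Mask the even/odd column vectors so lanes past the end of the row read
      // as zero; e.g. with 3 columns left, the caller-prepared mask_even keeps
      // two lanes and mask_odd keeps one.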
      vi0x468A = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi0x468A)));
      vi1x468A = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi1x468A)));
      vi2x468A = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi2x468A)));
      vi3x468A = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi3x468A)));
      vi4x468A = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi4x468A)));

      vi0x579B = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0x579B)));
      vi1x579B = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi1x579B)));
      vi2x579B = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi2x579B)));
      vi3x579B = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi3x579B)));
      vi4x579B = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi4x579B)));

      // middle tap
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x468A, vw0123, 3);
      float32x4_t vo468Ap01 = vmulq_laneq_f32(vi1x468A, vw89AB, 0);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x468A, vwCDEF, 1);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x468A, vwGHIJ, 2);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x468A, vwKLMN, 3);

      // one left
      const float32x4_t vi0x3579 = vextq_f32(vi0x0123, vi0x579B, 3);
      const float32x4_t vi1x3579 = vextq_f32(vi1x0123, vi1x579B, 3);
      const float32x4_t vi2x3579 = vextq_f32(vi2x0123, vi2x579B, 3);
      const float32x4_t vi3x3579 = vextq_f32(vi3x0123, vi3x579B, 3);
      const float32x4_t vi4x3579 = vextq_f32(vi4x0123, vi4x579B, 3);

      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x3579, vw0123, 2);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x3579, vw4567, 3);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x3579, vwCDEF, 0);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x3579, vwGHIJ, 1);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x3579, vwKLMN, 2);

      // two left
      // Same trick as in the main loop: reinterpret as uint64x2, shift left by
      // 32 bits, and reinterpret back so lane 3 holds the value the following
      // ext needs.
      const float32x4_t vi0x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi0x0123), 32));
      const float32x4_t vi1x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi1x0123), 32));
      const float32x4_t vi2x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi2x0123), 32));
      const float32x4_t vi3x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi3x0123), 32));
      const float32x4_t vi4x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi4x0123), 32));

      const float32x4_t vi0x2468 = vextq_f32(vi0x0012, vi0x468A, 3);
      const float32x4_t vi1x2468 = vextq_f32(vi1x0012, vi1x468A, 3);
      const float32x4_t vi2x2468 = vextq_f32(vi2x0012, vi2x468A, 3);
      const float32x4_t vi3x2468 = vextq_f32(vi3x0012, vi3x468A, 3);
      const float32x4_t vi4x2468 = vextq_f32(vi4x0012, vi4x468A, 3);

      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x2468, vw0123, 1);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x2468, vw4567, 2);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x2468, vw89AB, 3);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x2468, vwGHIJ, 0);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x2468, vwKLMN, 1);

      vi0x0123 = vi0x89AB;
      vi1x0123 = vi1x89AB;
      vi2x0123 = vi2x89AB;
      vi3x0123 = vi3x89AB;
      vi4x0123 = vi4x89AB;

      // one right
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x579B, vw4567, 0);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x579B, vw89AB, 1);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x579B, vwCDEF, 2);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x579B, vwGHIJ, 3);
      vo468Ap00 = vfmaq_lane_f32(vo468Ap00, vi4x579B, vwOP, 0);

      // two right
      const float32x4_t vi0x68AC = vextq_f32(vi0x468A, vi0xCDEF, 1);
      const float32x4_t vi1x68AC = vextq_f32(vi1x468A, vi1xCDEF, 1);
      const float32x4_t vi2x68AC = vextq_f32(vi2x468A, vi2xCDEF, 1);
      const float32x4_t vi3x68AC = vextq_f32(vi3x468A, vi3xCDEF, 1);
      const float32x4_t vi4x68AC = vextq_f32(vi4x468A, vi4xCDEF, 1);

      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x68AC, vw4567, 1);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x68AC, vw89AB, 2);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x68AC, vwCDEF, 3);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x68AC, vwKLMN, 0);
      vo468Ap00 = vfmaq_lane_f32(vo468Ap00, vi4x68AC, vwOP, 1);

      vi0x4567 = vi0xCDEF;
      vi1x4567 = vi1xCDEF;
      vi2x4567 = vi2xCDEF;
      vi3x4567 = vi3xCDEF;
      vi4x4567 = vi4xCDEF;

      float32x4_t vo0 = vaddq_f32(vo468Ap00, vo468Ap01);

      vo0 = vmaxq_f32(vo0, vmin);
      vo0 = vminq_f32(vo0, vmax);

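      // Store the ceil(k/2) remaining outputs: a full tuple of 4 when
      // possible, otherwise pairs and/or a single lane.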
      size_t k_tmp = (k + 1) / 2;
      if XNN_LIKELY(k_tmp >= 4) {
        vst1q_f32(output0, vo0);
        output0 = (float*) ((uintptr_t) output0 + output_tuple_stride);
      } else {
        float* output0_lo = output0;
        float32x2_t vo0_lo = vget_low_f32(vo0);
        if (k_tmp & 2) {
          vst1_f32(output0_lo, vo0_lo); output0_lo += 2;
          vo0_lo = vget_high_f32(vo0);
        }
        if (k_tmp & 1) {
          vst1_lane_f32(output0_lo, vo0_lo, 0);
        }
      }
    }

    i0 = (const float*) ((uintptr_t) i0 + input_width_increment_single);
    i1 = (const float*) ((uintptr_t) i1 + input_width_increment_single);
    i2 = (const float*) ((uintptr_t) i2 + input_width_increment_single);
    i3 = (const float*) ((uintptr_t) i3 + input_width_increment_single);
    i4 = (const float*) ((uintptr_t) i4 + input_width_increment_single);
    output0 = (float*) ((uintptr_t) output0 + output_width_increment_single);
    m -= 1;
  } while (m > 0);
}