Blame - src/f32-dwconv-spchw/5x5s2p2-neonfma.c - platform/external/XNNPACK

blob: 73d8abff50afda51d4fa1588b984794a93e6bc12 [file] [log] [blame]

XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	1	// Copyright 2019 Google LLC
				2	//
				3	// This source code is licensed under the BSD-style license found in the
				4	// LICENSE file in the root directory of this source tree.
				5
				6	#include <assert.h>
				7
				8	#include <arm_neon.h>
				9
				10	#include <xnnpack/dwconv.h>
				11	#include <xnnpack/math.h>
				12
				13
				14	void xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma(
				15	size_t m,
				16	size_t n,
				17	const float* input,
				18	const float* weights,
				19	float* output,
				20	size_t input_tuple_stride,
				21	size_t output_tuple_stride,
				22	size_t input_width_stride,
				23	size_t output_width_stride,
				24	const union xnn_f32_spchw_params params[restrict static 1])
				25	{
				26	assert(n != 0);
				27
				28	const uint32x4_t vmask_even = vld1q_u32(params->neon.mask_even);
				29	const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
Frank Barchard	fcfdc0e	2019-10-21 15:58:42 -0700	[diff] [blame^]	30	const float32x4_t vmax = vld1q_dup_f32(&params->neon.max);
				31	const float32x4_t vmin = vld1q_dup_f32(&params->neon.min);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	32
				33	const size_t input_width_increment_single = input_width_stride * 2 - input_tuple_stride * ( (n - 1) / 4 + 1);
				34	const size_t output_width_increment_single = output_width_stride - (n + 1) / 8 * output_tuple_stride;
				35
Marat Dukhan	80fc932	2019-09-29 21:06:36 -0700	[diff] [blame]	36	// No vertical padding.
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	37	const float* i0 = input;
				38	const float* i1 = (const float*) ((uintptr_t) i0 + input_width_stride);
				39	const float* i2 = (const float*) ((uintptr_t) i1 + input_width_stride);
				40	const float* i3 = (const float*) ((uintptr_t) i2 + input_width_stride);
				41	const float* i4 = (const float*) ((uintptr_t) i3 + input_width_stride);
				42
				43	float* output0 = output;
				44
				45	const float32x4_t vw0123 = vld1q_f32(weights);
				46	const float32x4_t vw4567 = vld1q_f32(weights + 4);
				47	const float32x4_t vw89AB = vld1q_f32(weights + 8);
				48	const float32x4_t vwCDEF = vld1q_f32(weights + 12);
				49	const float32x4_t vwGHIJ = vld1q_f32(weights + 16);
				50	const float32x4_t vwKLMN = vld1q_f32(weights + 20);
				51	const float32x2_t vwOP = vld1_f32( weights + 24);
				52
				53	do {
				54	float32x4_t vi0x0123 = vmovq_n_f32(0.0f);
				55	float32x4_t vi1x0123 = vmovq_n_f32(0.0f);
				56	float32x4_t vi2x0123 = vmovq_n_f32(0.0f);
				57	float32x4_t vi3x0123 = vmovq_n_f32(0.0f);
				58	float32x4_t vi4x0123 = vmovq_n_f32(0.0f);
				59	float32x4_t vi0x4567 = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
				60	float32x4_t vi1x4567 = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
				61	float32x4_t vi2x4567 = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
				62	float32x4_t vi3x4567 = vld1q_f32(i3); i3 = (const float*) ((uintptr_t) i3 + input_tuple_stride);
				63	float32x4_t vi4x4567 = vld1q_f32(i4); i4 = (const float*) ((uintptr_t) i4 + input_tuple_stride);
				64
				65	long long k = n;
				66	for (; k > 0; k -= 8) {
				67	float32x4_t vo468Ap00 = vdupq_laneq_f32(vw0123, 0);
				68
				69	float32x4_t vi0x89AB;
				70	float32x4_t vi1x89AB;
				71	float32x4_t vi2x89AB;
				72	float32x4_t vi3x89AB;
				73	float32x4_t vi4x89AB;
				74
				75	if XNN_LIKELY(k > 4) {
				76	vi0x89AB = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
				77	vi1x89AB = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
				78	vi2x89AB = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
				79	vi3x89AB = vld1q_f32(i3); i3 = (const float*) ((uintptr_t) i3 + input_tuple_stride);
				80	vi4x89AB = vld1q_f32(i4); i4 = (const float*) ((uintptr_t) i4 + input_tuple_stride);
				81	} else {
				82	vi0x89AB = vmovq_n_f32(0.f);
				83	vi1x89AB = vmovq_n_f32(0.f);
				84	vi2x89AB = vmovq_n_f32(0.f);
				85	vi3x89AB = vmovq_n_f32(0.f);
				86	vi4x89AB = vmovq_n_f32(0.f);
				87	}
				88
				89	float32x4_t vi0xCDEF;
				90	float32x4_t vi1xCDEF;
				91	float32x4_t vi2xCDEF;
				92	float32x4_t vi3xCDEF;
				93	float32x4_t vi4xCDEF;
				94
				95	if XNN_LIKELY(k > 8) {
				96	vi0xCDEF = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
				97	vi1xCDEF = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
				98	vi2xCDEF = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
				99	vi3xCDEF = vld1q_f32(i3); i3 = (const float*) ((uintptr_t) i3 + input_tuple_stride);
				100	vi4xCDEF = vld1q_f32(i4); i4 = (const float*) ((uintptr_t) i4 + input_tuple_stride);
				101	} else {
				102	vi0xCDEF = vmovq_n_f32(0.f);
				103	vi1xCDEF = vmovq_n_f32(0.f);
				104	vi2xCDEF = vmovq_n_f32(0.f);
				105	vi3xCDEF = vmovq_n_f32(0.f);
				106	vi4xCDEF = vmovq_n_f32(0.f);
				107	}
				108	float32x4_t vi0x468A = vuzp1q_f32(vi0x4567, vi0x89AB);
				109	float32x4_t vi0x579B = vuzp2q_f32(vi0x4567, vi0x89AB);
				110	float32x4_t vi1x468A = vuzp1q_f32(vi1x4567, vi1x89AB);
				111	float32x4_t vi1x579B = vuzp2q_f32(vi1x4567, vi1x89AB);
				112	float32x4_t vi2x468A = vuzp1q_f32(vi2x4567, vi2x89AB);
				113	float32x4_t vi2x579B = vuzp2q_f32(vi2x4567, vi2x89AB);
				114	float32x4_t vi3x468A = vuzp1q_f32(vi3x4567, vi3x89AB);
				115	float32x4_t vi3x579B = vuzp2q_f32(vi3x4567, vi3x89AB);
				116	float32x4_t vi4x468A = vuzp1q_f32(vi4x4567, vi4x89AB);
				117	float32x4_t vi4x579B = vuzp2q_f32(vi4x4567, vi4x89AB);
				118
				119	if XNN_UNLIKELY(k <= 8) {
				120	vi0x468A = vreinterpretq_u32_f32(vandq_u32(vmask_even, vreinterpretq_f32_u32(vi0x468A)));
				121	vi1x468A = vreinterpretq_u32_f32(vandq_u32(vmask_even, vreinterpretq_f32_u32(vi1x468A)));
				122	vi2x468A = vreinterpretq_u32_f32(vandq_u32(vmask_even, vreinterpretq_f32_u32(vi2x468A)));
				123	vi3x468A = vreinterpretq_u32_f32(vandq_u32(vmask_even, vreinterpretq_f32_u32(vi3x468A)));
				124	vi4x468A = vreinterpretq_u32_f32(vandq_u32(vmask_even, vreinterpretq_f32_u32(vi4x468A)));
				125
				126	vi0x579B = vreinterpretq_u32_f32(vandq_u32(vmask_odd, vreinterpretq_f32_u32(vi0x579B)));
				127	vi1x579B = vreinterpretq_u32_f32(vandq_u32(vmask_odd, vreinterpretq_f32_u32(vi1x579B)));
				128	vi2x579B = vreinterpretq_u32_f32(vandq_u32(vmask_odd, vreinterpretq_f32_u32(vi2x579B)));
				129	vi3x579B = vreinterpretq_u32_f32(vandq_u32(vmask_odd, vreinterpretq_f32_u32(vi3x579B)));
				130	vi4x579B = vreinterpretq_u32_f32(vandq_u32(vmask_odd, vreinterpretq_f32_u32(vi4x579B)));
				131	}
				132
				133	// middle tap
				134	vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x468A, vw0123, 3);
				135	float32x4_t vo468Ap01 = vmulq_laneq_f32(vi1x468A, vw89AB, 0);
				136	vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x468A, vwCDEF, 1);
				137	vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x468A, vwGHIJ, 2);
				138	vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x468A, vwKLMN, 3);
				139
				140	// one left
				141	const float32x4_t vi0x3579 = vextq_f32(vi0x0123, vi0x579B, 3);
				142	const float32x4_t vi1x3579 = vextq_f32(vi1x0123, vi1x579B, 3);
				143	const float32x4_t vi2x3579 = vextq_f32(vi2x0123, vi2x579B, 3);
				144	const float32x4_t vi3x3579 = vextq_f32(vi3x0123, vi3x579B, 3);
				145	const float32x4_t vi4x3579 = vextq_f32(vi4x0123, vi4x579B, 3);
				146
				147	vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x3579, vw0123, 2);
				148	vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x3579, vw4567, 3);
				149	vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x3579, vwCDEF, 0);
				150	vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x3579, vwGHIJ, 1);
				151	vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x3579, vwKLMN, 2);
				152
				153	// two left
				154	// getting the vector to use for the far left tap is annoying
				155	// as we can't ext anything we currently have to get it.
				156	// To do this, we get a bit ugly. Interpret the float 32x4
				157	// vector as int 64x2. Then left shift by 32. Interpret
				158	// again as float 32x4. Now the right most bits are what we
				159	// want them to be for the following ext.
				160	const float32x4_t vi0x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi0x0123), 32));
				161	const float32x4_t vi1x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi1x0123), 32));
				162	const float32x4_t vi2x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi2x0123), 32));
				163	const float32x4_t vi3x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi3x0123), 32));
				164	const float32x4_t vi4x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi4x0123), 32));
				165
				166	const float32x4_t vi0x2468 = vextq_f32(vi0x0012, vi0x468A, 3);
				167	const float32x4_t vi1x2468 = vextq_f32(vi1x0012, vi1x468A, 3);
				168	const float32x4_t vi2x2468 = vextq_f32(vi2x0012, vi2x468A, 3);
				169	const float32x4_t vi3x2468 = vextq_f32(vi3x0012, vi3x468A, 3);
				170	const float32x4_t vi4x2468 = vextq_f32(vi4x0012, vi4x468A, 3);
				171
				172	vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x2468, vw0123, 1);
				173	vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x2468, vw4567, 2);
				174	vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x2468, vw89AB, 3);
				175	vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x2468, vwGHIJ, 0);
				176	vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x2468, vwKLMN, 1);
				177
				178	vi0x0123 = vi0x89AB;
				179	vi1x0123 = vi1x89AB;
				180	vi2x0123 = vi2x89AB;
				181	vi3x0123 = vi3x89AB;
				182	vi4x0123 = vi4x89AB;
				183
				184	// one right
				185	vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x579B, vw4567, 0);
				186	vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x579B, vw89AB, 1);
				187	vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x579B, vwCDEF, 2);
				188	vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x579B, vwGHIJ, 3);
				189	vo468Ap00 = vfmaq_lane_f32( vo468Ap00, vi4x579B, vwOP, 0);
				190
				191	// two right
				192	const float32x4_t vi0x68AC = vextq_f32(vi0x468A, vi0xCDEF, 1);
				193	const float32x4_t vi1x68AC = vextq_f32(vi1x468A, vi1xCDEF, 1);
				194	const float32x4_t vi2x68AC = vextq_f32(vi2x468A, vi2xCDEF, 1);
				195	const float32x4_t vi3x68AC = vextq_f32(vi3x468A, vi3xCDEF, 1);
				196	const float32x4_t vi4x68AC = vextq_f32(vi4x468A, vi4xCDEF, 1);
				197
				198	vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x68AC, vw4567, 1);
				199	vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x68AC, vw89AB, 2);
				200	vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x68AC, vwCDEF, 3);
				201	vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x68AC, vwKLMN, 0);
				202	vo468Ap00 = vfmaq_lane_f32( vo468Ap00, vi4x68AC, vwOP, 1);
				203
				204	vi0x4567 = vi0xCDEF;
				205	vi1x4567 = vi1xCDEF;
				206	vi2x4567 = vi2xCDEF;
				207	vi3x4567 = vi3xCDEF;
				208	vi4x4567 = vi4xCDEF;
				209
				210	float32x4_t vo0 = vaddq_f32(vo468Ap00, vo468Ap01);
				211
Frank Barchard	fcfdc0e	2019-10-21 15:58:42 -0700	[diff] [blame^]	212	vo0 = vmaxq_f32(vo0, vmin);
				213	vo0 = vminq_f32(vo0, vmax);
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	214
				215	size_t k_tmp = (k + 1) / 2;
				216	if XNN_LIKELY(k_tmp >= 4) {
				217	vst1q_f32(output0, vo0);
				218	output0 = (float*) ((uintptr_t) output0 + output_tuple_stride);
				219	} else {
				220	float* output0_lo = output0;
				221	float32x2_t vo0_lo = vget_low_f32(vo0);
				222	if (k_tmp & 2) {
				223	vst1_f32(output0_lo, vo0_lo); output0_lo += 2;
				224	vo0_lo = vget_high_f32(vo0);
				225	}
				226	if (k_tmp & 1) {
				227	vst1_lane_f32(output0_lo, vo0_lo, 0);
				228	}
				229	}
				230	}
				231
				232	i0 = (const float*) ((uintptr_t) i0 + input_width_increment_single);
				233	i1 = (const float*) ((uintptr_t) i1 + input_width_increment_single);
				234	i2 = (const float*) ((uintptr_t) i2 + input_width_increment_single);
				235	i3 = (const float*) ((uintptr_t) i3 + input_width_increment_single);
				236	i4 = (const float*) ((uintptr_t) i4 + input_width_increment_single);
				237	output0 = (float*) ((uintptr_t) output0 + output_width_increment_single);
				238	m -= 1;
				239	} while (m > 0);
				240	}