Blame - src/f32-dwconv2d-chw/gen/3x3p1-minmax-neon-2x4.c - platform/external/XNNPACK

blob: 8fb484222261b4c445cb9cedaa84647939f8b86c [file] [log] [blame]

Marat Dukhan	c581e48	2020-10-24 01:28:11 -0700	[diff] [blame]	1	// Auto-generated file. Do not edit!
				2	// Template: src/f32-dwconv2d-chw/3x3p1-neon.c.in
				3	// Generator: tools/xngen
				4	//
				5	// Copyright 2020 Google LLC
				6	//
				7	// This source code is licensed under the BSD-style license found in the
				8	// LICENSE file in the root directory of this source tree.
				9
				10	#include <assert.h>
				11
				12	#include <arm_neon.h>
				13
				14	#include <xnnpack/dwconv.h>
				15	#include <xnnpack/math.h>
				16
				17
					// 3x3 depthwise convolution micro-kernel (top padding asserted to be 1)
					// written with NEON intrinsics. Each pass of the outer loop produces up to
					// two output rows, four floats at a time.
					//
					// input_height - number of input rows; must be non-zero.
					// input_width  - row width in BYTES (asserted to be a multiple of
					//                sizeof(float)); also used as the output row stride.
					// input        - first input row; consecutive rows are input_width bytes apart.
					// weights      - 10 floats: weights[0] seeds every accumulator (bias),
					//                weights[1..9] are the 3x3 taps in row-major order.
					// zero         - row substituted for the padding row above the image and for
					//                rows past the bottom (presumably all zeros - confirm with caller).
					// output       - first output row; rows are written input_width bytes apart.
					// padding_top  - must be 1.
					// params       - supplies neon.mask (ANDed with the last block of each row)
					//                and the neon.min / neon.max clamp bounds.
				18	void xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4(
				19	size_t input_height,
				20	size_t input_width,
				21	const float* input,
				22	const float* weights,
				23	const float* zero,
				24	float* output,
				25	uint32_t padding_top,
				26	const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
				27	{
				28	assert(input_height != 0);
				29	assert(input_width != 0);
				30	assert(input_width % sizeof(float) == 0);
				31	assert(padding_top == 1);
				32
					// Load the tail mask and broadcast the clamp bounds to all four lanes.
				33	const uint32x4_t vmask = vld1q_u32(params->neon.mask);
				34	const float32x4_t vmax = vld1q_dup_f32(&params->neon.max);
				35	const float32x4_t vmin = vld1q_dup_f32(&params->neon.min);
				36
					// With kRC denoting the tap for kernel row R, column C:
					// vw0123 = {bias, k00, k01, k02}, vw4567 = {k10, k11, k12, k20}, vw89 = {k21, k22}.
				37	const float32x4_t vw0123 = vld1q_f32(weights);
				38	const float32x4_t vw4567 = vld1q_f32(weights + 4);
				39	const float32x2_t vw89 = vld1_f32(weights + 8);
				40
					// Bytes each row pointer advances during one pass over a row: the width
					// rounded up to whole 4-float blocks. Used below to rewind to the row start.
				41	const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
				42
					// i0..i3 are four consecutive input rows; i0 starts as the padding row
					// above the image.
				43	const float* i0 = zero;
				44	const float* i1 = input;
				45	const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
				46	const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
				47
					// o0 / o1 are the two output rows produced per pass.
				48	float* o0 = output;
				49	float* o1 = (float*) ((uintptr_t) o0 + input_width);
				50
				51	size_t output_height = input_height;
				52	do {
					// Only one row left: the row below it is padding, and the second output
					// row does not exist, so o1 is aliased to o0.
				53	if XNN_UNPREDICTABLE(output_height < 2) {
				54	i2 = zero;
				55	o1 = o0;
				56	}
					// Fewer than three rows left: the row below the second output row is padding.
				57	if XNN_UNPREDICTABLE(output_height < 3) {
				58	i3 = zero;
				59	}
				60
					// The "previous block" of every row starts as zeros; this supplies the
					// left padding column.
				61	float32x4_t vi0x0123 = vmovq_n_f32(0.0f);
				62	float32x4_t vi1x0123 = vmovq_n_f32(0.0f);
				63	float32x4_t vi2x0123 = vmovq_n_f32(0.0f);
				64	float32x4_t vi3x0123 = vmovq_n_f32(0.0f);
				65
					// Pre-load the first 4-float block of each row.
				66	float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
				67	float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
				68	float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
				69	float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
				70
					// w counts the remaining row width in bytes. The main loop runs while more
					// than one 4-float block remains; the last block is handled separately.
				71	size_t w = input_width;
				72	for (; w > 4 * sizeof(float); w -= 4 * sizeof(float)) {
					// Seed both accumulators with the bias (lane 0 of vw0123).
				73	float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
				74	float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
				75
					// Load the next block of each row; it provides the right neighbour of lane 3.
				76	const float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4;
				77	const float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4;
				78	const float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4;
				79	const float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4;
				80
					// Centre-column taps (k01, k11, k21). Output row 0 reads rows i0..i2,
					// output row 1 reads rows i1..i3.
				81	vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 0);
				82	vo1p0 = vmlaq_lane_f32(vo1p0, vi1x4567, vget_high_f32(vw0123), 0);
				83
				84	vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw4567), 1);
				85	vo1p0 = vmlaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw4567), 1);
				86
				87	vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vw89, 0);
				88	vo1p0 = vmlaq_lane_f32(vo1p0, vi3x4567, vw89, 0);
				89
					// Shift in the last element of the previous block so that each lane holds
					// the left neighbour of the corresponding pixel.
				90	const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
				91	const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
				92	const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
				93	const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
				94
					// Left-column taps (k00, k10, k20).
				95	vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_low_f32(vw0123), 1);
				96	vo1p0 = vmlaq_lane_f32(vo1p0, vi1x3456, vget_low_f32(vw0123), 1);
				97
				98	vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_low_f32(vw4567), 0);
				99	vo1p0 = vmlaq_lane_f32(vo1p0, vi2x3456, vget_low_f32(vw4567), 0);
				100
				101	vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_high_f32(vw4567), 1);
				102	vo1p0 = vmlaq_lane_f32(vo1p0, vi3x3456, vget_high_f32(vw4567), 1);
				103
					// The current block becomes the "previous block" of the next iteration.
				104	vi0x0123 = vi0x4567;
				105	vi1x0123 = vi1x4567;
				106	vi2x0123 = vi2x4567;
				107	vi3x0123 = vi3x4567;
				108
					// Shift in the first element of the next block so that each lane holds
					// the right neighbour of the corresponding pixel.
				109	const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vi0x89AB, 1);
				110	const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
				111	const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1);
				112	const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1);
				113
					// Right-column taps (k02, k12, k22).
				114	vo0p0 = vmlaq_lane_f32(vo0p0, vi0x5678, vget_high_f32(vw0123), 1);
				115	vo1p0 = vmlaq_lane_f32(vo1p0, vi1x5678, vget_high_f32(vw0123), 1);
				116
				117	vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_high_f32(vw4567), 0);
				118	vo1p0 = vmlaq_lane_f32(vo1p0, vi2x5678, vget_high_f32(vw4567), 0);
				119
				120	vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vw89, 1);
				121	vo1p0 = vmlaq_lane_f32(vo1p0, vi3x5678, vw89, 1);
				122
					// Advance: the block just loaded becomes the current block.
				123	vi0x4567 = vi0x89AB;
				124	vi1x4567 = vi1x89AB;
				125	vi2x4567 = vi2x89AB;
				126	vi3x4567 = vi3x89AB;
				127
				128
					// Clamp the results to [min, max].
				129	float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
				130	float32x4_t vo1 = vmaxq_f32(vo1p0, vmin);
				131
				132	vo0 = vminq_f32(vo0, vmax);
				133	vo1 = vminq_f32(vo1, vmax);
				134
					// o1 is stored before o0 so that, when o1 aliases o0 (single remaining
					// row), the row-0 result is the one left in memory.
				135	vst1q_f32(o1, vo1); o1 += 4;
				136	vst1q_f32(o0, vo0); o0 += 4;
				137	}
				138	// Always process the last block of 1..4 pixels.
				139	assert(w >= 1 * sizeof(float));
				140	assert(w <= 4 * sizeof(float));
				141	{
					// Seed both accumulators with the bias, as in the main loop.
				142	float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
				143	float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
				144
					// AND the last block with the mask from params (presumably this clears the
					// lanes that lie past the end of the row - confirm against the code that
					// fills params->neon.mask).
				145	vi0x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0x4567)));
				146	vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567)));
				147	vi2x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi2x4567)));
				148	vi3x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi3x4567)));
				149
					// Centre-column taps (k01, k11, k21).
				150	vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 0);
				151	vo1p0 = vmlaq_lane_f32(vo1p0, vi1x4567, vget_high_f32(vw0123), 0);
				152
				153	vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw4567), 1);
				154	vo1p0 = vmlaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw4567), 1);
				155
				156	vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vw89, 0);
				157	vo1p0 = vmlaq_lane_f32(vo1p0, vi3x4567, vw89, 0);
				158
					// Left neighbours, built from the previous block as in the main loop.
				159	const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
				160	const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
				161	const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
				162	const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
				163
					// Left-column taps (k00, k10, k20).
				164	vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_low_f32(vw0123), 1);
				165	vo1p0 = vmlaq_lane_f32(vo1p0, vi1x3456, vget_low_f32(vw0123), 1);
				166
				167	vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_low_f32(vw4567), 0);
				168	vo1p0 = vmlaq_lane_f32(vo1p0, vi2x3456, vget_low_f32(vw4567), 0);
				169
				170	vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_high_f32(vw4567), 1);
				171	vo1p0 = vmlaq_lane_f32(vo1p0, vi3x3456, vget_high_f32(vw4567), 1);
				172
					// There is no block to the right, so zeros are shifted in instead; this
					// supplies the right padding column.
				173	const float32x4_t vzero = vmovq_n_f32(0.0f);
				174	const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vzero, 1);
				175	const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);
				176	const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vzero, 1);
				177	const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vzero, 1);
				178
					// Right-column taps (k02, k12, k22).
				179	vo0p0 = vmlaq_lane_f32(vo0p0, vi0x5678, vget_high_f32(vw0123), 1);
				180	vo1p0 = vmlaq_lane_f32(vo1p0, vi1x5678, vget_high_f32(vw0123), 1);
				181
				182	vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_high_f32(vw4567), 0);
				183	vo1p0 = vmlaq_lane_f32(vo1p0, vi2x5678, vget_high_f32(vw4567), 0);
				184
				185	vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vw89, 1);
				186	vo1p0 = vmlaq_lane_f32(vo1p0, vi3x5678, vw89, 1);
				187
				188
					// Clamp the results to [min, max].
				189	float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
				190	float32x4_t vo1 = vmaxq_f32(vo1p0, vmin);
				191
				192	vo0 = vminq_f32(vo0, vmax);
				193	vo1 = vminq_f32(vo1, vmax);
				194
					// Full block: store all four floats (o1 first, see the main loop).
				195	if XNN_LIKELY(w == 4 * sizeof(float)) {
				196	vst1q_f32(o1, vo1); o1 += 4;
				197	vst1q_f32(o0, vo0); o0 += 4;
				198	} else {
					// Partial block of 1..3 floats: store two floats and/or one float
					// according to the bits of w.
				199	float32x2_t vo0_lo = vget_low_f32(vo0);
				200	float32x2_t vo1_lo = vget_low_f32(vo1);
				201	if (w & (2 * sizeof(float))) {
				202	vst1_f32(o1, vo1_lo); o1 += 2;
				203	vst1_f32(o0, vo0_lo); o0 += 2;
				204
					// Move the upper half down so the single-float store below reads lane 0.
				205	vo0_lo = vget_high_f32(vo0);
				206	vo1_lo = vget_high_f32(vo1);
				207	}
				208	if (w & (1 * sizeof(float))) {
				209	vst1_lane_f32(o1, vo1_lo, 0); o1 += 1;
				210	vst1_lane_f32(o0, vo0_lo, 0); o0 += 1;
				211	}
				212	}
				213	}
				214
					// Slide the window down two rows: rewind i2 / i3 to the start of their
					// rows to become the new i0 / i1, then derive the new i2 / i3 from i1.
				215	i0 = (const float*) ((uintptr_t) i2 - input_decrement);
				216	i1 = (const float*) ((uintptr_t) i3 - input_decrement);
				217	i2 = (const float*) ((uintptr_t) i1 + input_width);
				218	i3 = (const float*) ((uintptr_t) i2 + input_width);
				219
					// o1 has advanced by one full row of output, so it now marks where the
					// next pair of output rows begins.
				220	o0 = o1;
				221	o1 = (float*) ((uintptr_t) o0 + input_width);
				222
					// doz() comes from outside this file (presumably xnnpack/math.h, a
					// subtraction that saturates at zero so an odd row count still ends the
					// loop - confirm).
				223	output_height = doz(output_height, 2);
				224	} while (output_height != 0);
				225	}