Blame - src/f32-dwconv-spchw/3x3s2p1-scalar.c - platform/external/XNNPACK

blob: b525bf6cfe0226f43282aed85db50f93bf926206 [file] [log] [blame]

Erich Elsen	ac4de80	2019-10-16 04:35:30 -0700	[diff] [blame^]	1	// Copyright 2019 Google LLC
				2	//
				3	// This source code is licensed under the BSD-style license found in the
				4	// LICENSE file in the root directory of this source tree.
				5
				6	#include <assert.h>
				7
				8	#include <xnnpack/dwconv.h>
				9	#include <xnnpack/math.h>
				10
				11
				12	void xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar(
				13	size_t m,
				14	size_t n,
				15	const float* input,
				16	const float* weights,
				17	float* output,
				18	size_t input_tuple_stride,
				19	size_t output_tuple_stride,
				20	size_t input_width_stride,
				21	size_t output_width_stride,
				22	const union xnn_f32_spchw_params params[restrict static 1])
				23	{
				24	assert(n != 0);
				25
				26	const size_t input_width_increment = 2 * input_width_stride - (n/2) * 2 * input_tuple_stride;
				27	const size_t output_width_increment = output_width_stride - (n/2) * output_tuple_stride;
				28
				29	const float params_min = params->scalar.min;
				30	const float params_max = params->scalar.max;
				31
				32	// No vertical padding.
				33	const float* i0 = input;
				34	const float* i1 = (const float*) ((uintptr_t) i0 + input_width_stride);
				35	const float* i2 = (const float*) ((uintptr_t) i1 + input_width_stride);
				36
				37	float* output0 = output;
				38
				39	const float vw0 = weights[0];
				40	const float vw1 = weights[1];
				41	const float vw2 = weights[2];
				42	const float vw3 = weights[3];
				43	const float vw4 = weights[4];
				44	const float vw5 = weights[5];
				45	const float vw6 = weights[6];
				46	const float vw7 = weights[7];
				47	const float vw8 = weights[8];
				48	const float vw9 = weights[9];
				49
				50	while (m > 0) {
				51	float vi0x0 = 0.0f;
				52	float vi1x0 = 0.0f;
				53	float vi2x0 = 0.0f;
				54
				55	size_t k = n;
				56	for (; k >= 2; k -= 2) {
				57	const float vi0x1 = i0; i0 = (const float) ((uintptr_t) i0 + input_tuple_stride);
				58	const float vi1x1 = i1; i1 = (const float) ((uintptr_t) i1 + input_tuple_stride);
				59	const float vi2x1 = i2; i2 = (const float) ((uintptr_t) i2 + input_tuple_stride);
				60	const float vi0x2 = i0; i0 = (const float) ((uintptr_t) i0 + input_tuple_stride);
				61	const float vi1x2 = i1; i1 = (const float) ((uintptr_t) i1 + input_tuple_stride);
				62	const float vi2x2 = i2; i2 = (const float) ((uintptr_t) i2 + input_tuple_stride);
				63
				64	const float vrow0_accum = vw1 * vi0x0 + vw2 * vi0x1 + vw3 * vi0x2;
				65	vi0x0 = vi0x2;
				66	const float vrow1_accum = vw4 * vi1x0 + vw5 * vi1x1 + vw6 * vi1x2;
				67	vi1x0 = vi1x2;
				68	const float vrow2_accum = vw7 * vi2x0 + vw8 * vi2x1 + vw9 * vi2x2;
				69	vi2x0 = vi2x2;
				70
				71	float voutput = (vw0 + vrow0_accum) + (vrow1_accum + vrow2_accum);
				72
				73	voutput = math_max_f32(voutput, params_min);
				74	voutput = math_min_f32(voutput, params_max);
				75
				76	output0 = voutput; output0 = (float ) ((uintptr_t) output0 + output_tuple_stride);
				77	}
				78	// Possibly process the last pixel separately to account for right edge.
				79	if (k == 1)
				80	{
				81	const float vi0x1 = i0[0];
				82	const float vi1x1 = i1[0];
				83	const float vi2x1 = i2[0];
				84	const float vrow0_accum = vw1 * vi0x0 + vw2 * vi0x1;
				85	const float vrow1_accum = vw4 * vi1x0 + vw5 * vi1x1;
				86	const float vrow2_accum = vw7 * vi2x0 + vw8 * vi2x1;
				87
				88	float voutput = (vw0 + vrow0_accum) + (vrow1_accum + vrow2_accum);
				89
				90	voutput = math_max_f32(voutput, params_min);
				91	voutput = math_min_f32(voutput, params_max);
				92
				93	*output0 = voutput;
				94	}
				95
				96	i0 = (const float*) ((uintptr_t) i0 + input_width_increment);
				97	i1 = (const float*) ((uintptr_t) i1 + input_width_increment);
				98	i2 = (const float*) ((uintptr_t) i2 + input_width_increment);
				99	output0 = (float*) ((uintptr_t) output0 + output_width_increment);
				100	m--;
				101	}
				102	}