Blame - src/f32-dwconv-spchw/3x3p1-scalar.c - platform/external/XNNPACK

blob: d5d69e18527afbb01ed18f29791bdcf35c3150f0 [file] [log] [blame]

Erich Elsen	0cc2c53	2019-10-15 04:44:18 -0700	[diff] [blame]	1	// Copyright 2019 Google LLC
				2	//
				3	// This source code is licensed under the BSD-style license found in the
				4	// LICENSE file in the root directory of this source tree.
				5
				6	#include <assert.h>
				7
				8	#include <xnnpack/dwconv.h>
				9	#include <xnnpack/math.h>
				10
				11
				12	void xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar(
				13	size_t m,
				14	size_t n,
				15	const float* input,
				16	const float* weights,
				17	float* output,
				18	size_t input_tuple_stride,
				19	size_t output_tuple_stride,
				20	size_t input_width_stride,
				21	size_t output_width_stride,
Marat Dukhan	f196d01	2020-04-15 11:50:03 -0700	[diff] [blame^]	22	const union xnn_f32_spchw_params params[restrict XNN_MIN_ELEMENTS(1)])
Erich Elsen	0cc2c53	2019-10-15 04:44:18 -0700	[diff] [blame]	23	{
				24	assert(n != 0);
				25
				26	const size_t input_width_increment = input_width_stride - n * input_tuple_stride;
				27	const size_t output_width_increment = output_width_stride - (n - 1) * output_tuple_stride;
				28
				29	const float params_min = params->scalar.min;
				30	const float params_max = params->scalar.max;
				31
				32	// No vertical padding.
				33	const float* i0 = input;
				34	const float* i1 = (const float*) ((uintptr_t) i0 + input_width_stride);
				35	const float* i2 = (const float*) ((uintptr_t) i1 + input_width_stride);
				36
				37	float* output0 = output;
				38
				39	const float vw0 = weights[0];
				40	const float vw1 = weights[1];
				41	const float vw2 = weights[2];
				42	const float vw3 = weights[3];
				43	const float vw4 = weights[4];
				44	const float vw5 = weights[5];
				45	const float vw6 = weights[6];
				46	const float vw7 = weights[7];
				47	const float vw8 = weights[8];
				48	const float vw9 = weights[9];
				49
				50	while (m > 0) {
				51	float vi0x0 = 0.0f;
				52	float vi1x0 = 0.0f;
				53	float vi2x0 = 0.0f;
				54	float vi0x1 = i0; i0 = (const float) ((uintptr_t) i0 + input_tuple_stride);
				55	float vi1x1 = i1; i1 = (const float) ((uintptr_t) i1 + input_tuple_stride);
				56	float vi2x1 = i2; i2 = (const float) ((uintptr_t) i2 + input_tuple_stride);
				57
				58	size_t k = n;
				59	for (; k > 1; k--) {
				60	const float vi0x2 = i0; i0 = (const float) ((uintptr_t) i0 + input_tuple_stride);
				61	const float vi1x2 = i1; i1 = (const float) ((uintptr_t) i1 + input_tuple_stride);
				62	const float vi2x2 = i2; i2 = (const float) ((uintptr_t) i2 + input_tuple_stride);
				63
				64	const float vrow0_accum = vw1 * vi0x0 + vw2 * vi0x1 + vw3 * vi0x2;
				65	vi0x0 = vi0x1;
				66	vi0x1 = vi0x2;
				67	const float vrow1_accum = vw4 * vi1x0 + vw5 * vi1x1 + vw6 * vi1x2;
				68	vi1x0 = vi1x1;
				69	vi1x1 = vi1x2;
				70	const float vrow2_accum = vw7 * vi2x0 + vw8 * vi2x1 + vw9 * vi2x2;
				71	vi2x0 = vi2x1;
				72	vi2x1 = vi2x2;
				73
				74	float voutput = (vw0 + vrow0_accum) + (vrow1_accum + vrow2_accum);
				75
				76	voutput = math_max_f32(voutput, params_min);
				77	voutput = math_min_f32(voutput, params_max);
				78
				79	output0 = voutput; output0 = (float ) ((uintptr_t) output0 + output_tuple_stride);
				80	}
				81	// Always process the last pixel separately to account for right edge.
				82	assert(k == 1);
				83	{
				84	const float vrow0_accum = vw1 * vi0x0 + vw2 * vi0x1;
				85	const float vrow1_accum = vw4 * vi1x0 + vw5 * vi1x1;
				86	const float vrow2_accum = vw7 * vi2x0 + vw8 * vi2x1;
				87
				88	float voutput = (vw0 + vrow0_accum) + (vrow1_accum + vrow2_accum);
				89
				90	voutput = math_max_f32(voutput, params_min);
				91	voutput = math_min_f32(voutput, params_max);
				92
				93	*output0 = voutput;
				94	}
				95
				96	i0 = (const float*) ((uintptr_t) i0 + input_width_increment);
				97	i1 = (const float*) ((uintptr_t) i1 + input_width_increment);
				98	i2 = (const float*) ((uintptr_t) i2 + input_width_increment);
				99	output0 = (float*) ((uintptr_t) output0 + output_width_increment);
				100	m--;
				101	}
				102	}