blob: 31398c76488a5a8182da80f0ec38496e3bea5532 [file] [log] [blame]
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <assert.h>
7
Erich Elseneda9c112020-05-11 04:40:25 -07008#include <stdio.h>
9
Erich Elsenac4de802019-10-16 04:35:30 -070010#include <xnnpack/dwconv.h>
11#include <xnnpack/math.h>
12
13
14void xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar(
Erich Elseneda9c112020-05-11 04:40:25 -070015 size_t input_height,
16 size_t input_width,
Erich Elsenac4de802019-10-16 04:35:30 -070017 const float* input,
18 const float* weights,
Erich Elsen4e5db3d2020-05-07 08:57:47 -070019 const float* zero,
Erich Elsenac4de802019-10-16 04:35:30 -070020 float* output,
Erich Elsen4e5db3d2020-05-07 08:57:47 -070021 uint32_t padding_top,
Erich Elsenac4de802019-10-16 04:35:30 -070022 size_t input_tuple_stride,
23 size_t output_tuple_stride,
24 size_t input_width_stride,
25 size_t output_width_stride,
Marat Dukhanf196d012020-04-15 11:50:03 -070026 const union xnn_f32_spchw_params params[restrict XNN_MIN_ELEMENTS(1)])
Erich Elsenac4de802019-10-16 04:35:30 -070027{
Erich Elseneda9c112020-05-11 04:40:25 -070028 assert(input_height!= 0);
29 assert(input_width != 0);
30 assert(padding_top >= 0 && padding_top <= 1);
Erich Elsenac4de802019-10-16 04:35:30 -070031
Erich Elseneda9c112020-05-11 04:40:25 -070032 const size_t padded_input_height = input_height + padding_top + 1 /* padding_bottom */;
33 const size_t output_height = (padded_input_height - 3) / 2 + 1;
34
35 const size_t input_width_decrement_single = (input_width/2) * 2 * input_tuple_stride;;
Erich Elsen4e5db3d2020-05-07 08:57:47 -070036 const size_t input_width_increment = 2 * input_width_stride - input_width_decrement_single;
Erich Elseneda9c112020-05-11 04:40:25 -070037 const size_t output_width_increment = output_width_stride - (input_width/2) * output_tuple_stride;
Erich Elsenac4de802019-10-16 04:35:30 -070038
39 const float params_min = params->scalar.min;
40 const float params_max = params->scalar.max;
41
Erich Elsen4e5db3d2020-05-07 08:57:47 -070042 const float* i0;
43 const float* i1;
44 const float* i2;
45
46 if (padding_top == 0) {
47 i0 = input;
48 i1 = (const float*) ((uintptr_t) i0 + input_width_stride);
49 i2 = (const float*) ((uintptr_t) i1 + input_width_stride);
Erich Elseneda9c112020-05-11 04:40:25 -070050 if (input_height <= 2) {
Erich Elsen4e5db3d2020-05-07 08:57:47 -070051 i2 = zero;
52 }
Erich Elseneda9c112020-05-11 04:40:25 -070053 if (input_height == 1) {
54 i1 = zero;
55 }
Erich Elsen4e5db3d2020-05-07 08:57:47 -070056 } else {
57 i0 = zero;
58 i1 = input;
59 i2 = (const float*) ((uintptr_t) i1 + input_width_stride);
Erich Elseneda9c112020-05-11 04:40:25 -070060 if (input_height == 1) {
Erich Elsen4e5db3d2020-05-07 08:57:47 -070061 i2 = zero;
62 }
63 }
Erich Elsenac4de802019-10-16 04:35:30 -070064
65 float* output0 = output;
66
67 const float vw0 = weights[0];
68 const float vw1 = weights[1];
69 const float vw2 = weights[2];
70 const float vw3 = weights[3];
71 const float vw4 = weights[4];
72 const float vw5 = weights[5];
73 const float vw6 = weights[6];
74 const float vw7 = weights[7];
75 const float vw8 = weights[8];
76 const float vw9 = weights[9];
77
Erich Elseneda9c112020-05-11 04:40:25 -070078 size_t m = output_height;
Erich Elsenac4de802019-10-16 04:35:30 -070079 while (m > 0) {
80 float vi0x0 = 0.0f;
81 float vi1x0 = 0.0f;
82 float vi2x0 = 0.0f;
83
Erich Elseneda9c112020-05-11 04:40:25 -070084 size_t k = input_width;
Erich Elsenac4de802019-10-16 04:35:30 -070085 for (; k >= 2; k -= 2) {
86 const float vi0x1 = *i0; i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
87 const float vi1x1 = *i1; i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
88 const float vi2x1 = *i2; i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
89 const float vi0x2 = *i0; i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
90 const float vi1x2 = *i1; i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
91 const float vi2x2 = *i2; i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
92
93 const float vrow0_accum = vw1 * vi0x0 + vw2 * vi0x1 + vw3 * vi0x2;
94 vi0x0 = vi0x2;
95 const float vrow1_accum = vw4 * vi1x0 + vw5 * vi1x1 + vw6 * vi1x2;
96 vi1x0 = vi1x2;
97 const float vrow2_accum = vw7 * vi2x0 + vw8 * vi2x1 + vw9 * vi2x2;
98 vi2x0 = vi2x2;
99
100 float voutput = (vw0 + vrow0_accum) + (vrow1_accum + vrow2_accum);
101
102 voutput = math_max_f32(voutput, params_min);
103 voutput = math_min_f32(voutput, params_max);
104
105 *output0 = voutput; output0 = (float *) ((uintptr_t) output0 + output_tuple_stride);
106 }
107 // Possibly process the last pixel separately to account for right edge.
108 if (k == 1)
109 {
110 const float vi0x1 = i0[0];
111 const float vi1x1 = i1[0];
112 const float vi2x1 = i2[0];
113 const float vrow0_accum = vw1 * vi0x0 + vw2 * vi0x1;
114 const float vrow1_accum = vw4 * vi1x0 + vw5 * vi1x1;
115 const float vrow2_accum = vw7 * vi2x0 + vw8 * vi2x1;
116
117 float voutput = (vw0 + vrow0_accum) + (vrow1_accum + vrow2_accum);
118
119 voutput = math_max_f32(voutput, params_min);
120 voutput = math_min_f32(voutput, params_max);
121
122 *output0 = voutput;
123 }
124
Erich Elsen4e5db3d2020-05-07 08:57:47 -0700125 i0 = (const float*) ((uintptr_t) i2 - input_width_decrement_single);
Erich Elsenac4de802019-10-16 04:35:30 -0700126 i1 = (const float*) ((uintptr_t) i1 + input_width_increment);
127 i2 = (const float*) ((uintptr_t) i2 + input_width_increment);
128 output0 = (float*) ((uintptr_t) output0 + output_width_increment);
Erich Elsen4e5db3d2020-05-07 08:57:47 -0700129 m -= 1;
Erich Elseneda9c112020-05-11 04:40:25 -0700130 if (m == 1 && padding_top == input_height % 2) {
131 // to mimic the following code with only one if, we do some small
132 // shenanigans...
133 // if (padding_top == 0 && input_height % 2 == 0) {
134 // i2 = zero;
135 // } else if (padding_top == 1 && input_height % 2 == 1) {
136 // i2 = zero;
137 // }
Erich Elsen4e5db3d2020-05-07 08:57:47 -0700138 i2 = zero;
139 }
Erich Elsenac4de802019-10-16 04:35:30 -0700140 }
141}