/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include <algorithm>
#include <cmath>

using namespace arm_compute;

CLFullyConnectedLayer::CLFullyConnectedLayer()
    : _im2col_kernel(), _transpose_kernel(), _transpose1xW_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _transpose_output(),
      _transpose1xW_output(), _is_first_run(true), _transpose_weights(true), _fc_after_conv(true), _batched_fc_layer(false), _accumulate_biases(false)
{
}

void CLFullyConnectedLayer::configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));

    // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
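    // im2col flattens each W x H x C input volume into a single row; the higher input dimensions (3, 4, 5)
    // are preserved so that each batch item maps to one row of the matrix multiplication's left-hand side.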

    // Initialize output tensor for im2col
    TensorShape shape_im2col;
    shape_im2col.set(0, weights->info()->dimension(1));
    shape_im2col.set(1, input->info()->dimension(3));
    shape_im2col.set(2, input->info()->dimension(4));
    shape_im2col.set(3, input->info()->dimension(5));
    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));

    // Initialize output tensor for interleave 4x4
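    // The interleave kernel packs groups of 4 rows into one, so the interleaved tensor is 4 times wider
    // and roughly 4 times shorter than the im2col output.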
    TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
    shape_interleaved.set(0, shape_interleaved.x() * 4);
    shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));

    // Initialize output tensor for transpose 1xW
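    // The transpose 1xW kernel reshapes the weights into (height * 4) x ceil(width / 4) so that the matrix
    // multiply kernel can traverse the right-hand side efficiently.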
    TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
    _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));

    // Configure im2col kernel
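    // im2col is configured with unit stride and no padding, so it only performs the flattening of the input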
    _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);

    // Configure interleave4x4 kernel
    _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);

    // Configure transpose 1xW kernel
    _transpose1xW_kernel.configure(weights, &_transpose1xW_output);

    // Configure matrix multiply kernel
    _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);

    // Allocate the tensors once all the configure methods have been called
    _im2col_output.allocator()->allocate();
    _interleave4x4_output.allocator()->allocate();
    _transpose1xW_output.allocator()->allocate();
}

void CLFullyConnectedLayer::configure_fc_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
    // Initialize output tensor for interleave 4x4
    TensorShape shape_interleaved = input->info()->tensor_shape();
    shape_interleaved.set(0, shape_interleaved.x() * 4);
    shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));

    // Initialize output tensor for transpose 1xW
    TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
    _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));

    // Configure interleave4x4 kernel
    _interleave4x4_kernel.configure(input, &_interleave4x4_output);

    // Configure transpose 1xW kernel
    _transpose1xW_kernel.configure(weights, &_transpose1xW_output);

    // Configure matrix multiply kernel
    _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);

    // Allocate the tensors once all the configure methods have been called
    _interleave4x4_output.allocator()->allocate();
    _transpose1xW_output.allocator()->allocate();
}

void CLFullyConnectedLayer::configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
    ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));

    // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
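    // Without batches the linearized input is a single row whose length is the flattened W x H x C size of the input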

    // Initialize output tensor for im2col
    TensorShape shape_im2col;
    shape_im2col.set(0, weights->info()->dimension(1));
    shape_im2col.set(1, 1);
    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));

    // Configure im2col kernel
    _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);

    // Configure matrix multiply kernel
    _mm_kernel.configure(&_im2col_output, weights, output, 1.0f);

    // Allocate the output tensor for im2col once all the configure methods have been called
    _im2col_output.allocator()->allocate();
}

void CLFullyConnectedLayer::configure_fc_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));

    // Configure matrix multiply kernel
    _mm_kernel.configure(input, weights, output, 1.0f);
}

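// A minimal usage sketch (illustrative only): `src`, `weights`, `bias` and `dst` are assumed to be
// CLTensor objects that the caller has already initialised and allocated.
//
//     CLFullyConnectedLayer fc;
//     fc.configure(&src, &weights, &bias, &dst, true /* transpose_weights */);
//     // ... fill the tensors ...
//     fc.run();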
void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights)
{
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);

    const ICLTensor *weights_to_use = weights;

    _is_first_run      = true;
    _transpose_weights = transpose_weights;
    _fc_after_conv     = true;
    _batched_fc_layer  = false;
    _accumulate_biases = false;

    if(biases != nullptr)
    {
        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);

        _accumulate_biases = true;

        // Configure accumulate biases kernel
        _accumulate_biases_kernel.configure(output, biases);
    }

    // Check if we need to transpose the weights
    if(_transpose_weights)
    {
        // Initialize the output tensor for transpose
        TensorShape shape_transposed(weights->info()->dimension(1), weights->info()->dimension(0));
        _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, weights->info()->data_type()));
        _transpose_kernel.configure(weights, &_transpose_output);

        weights_to_use = &_transpose_output;
    }

    // With the Fully Connected layer we can have 4 different cases:
    //  1) Convolution layer -> Fully Connected layer without batches
    //  2) Fully Connected layer -> Fully Connected layer without batches
    //  3) Convolution layer -> Fully Connected layer with batches
    //  4) Fully Connected layer -> Fully Connected layer with batches

    // Check if we have a fully connected layer with batches
    _batched_fc_layer = (output->info()->dimension(1) > 1);

    if(_batched_fc_layer)
    {
        _fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
                                                                               input->info()->tensor_shape().cend(),
                                                                               output->info()->tensor_shape().cbegin() + 1));

        if(_fc_after_conv)
        {
            // Fully Connected layer after a Convolution Layer with batches
            configure_conv_fc_wb(input, weights_to_use, output);
        }
        else
        {
            // Fully Connected layer after a Fully Connected Layer with batches
            configure_fc_fc_wb(input, weights_to_use, output);
        }
    }
    else
    {
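        // In the non-batched case the input is treated as coming from a convolution layer when the second
        // dimension of the weights matches the flattened W x H x C size of the input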
        _fc_after_conv = (weights_to_use->info()->dimension(1) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));

        if(_fc_after_conv)
        {
            // Fully Connected layer after a Convolution Layer without batches
            configure_conv_fc_nb(input, weights_to_use, output);
        }
        else
        {
            // Fully Connected layer after a Fully Connected Layer without batches
            configure_fc_fc_nb(input, weights_to_use, output);
        }
    }

    // Allocate the transpose tensor once all the configure methods have been called, but only if the transpose_weights flag is set
    if(_transpose_weights)
    {
        _transpose_output.allocator()->allocate();
    }
}

void CLFullyConnectedLayer::run()
{
    // The reshape of the weights happens only once
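    // and the reshaped copies are reused on every subsequent call, so the weights are assumed not to change between runs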
    if(_is_first_run)
    {
        _is_first_run = false;

        if(_transpose_weights)
        {
            CLScheduler::get().enqueue(_transpose_kernel);
        }

        if(_batched_fc_layer)
        {
            CLScheduler::get().enqueue(_transpose1xW_kernel);
        }
    }

    // Linearize input if it comes from a convolutional layer
    if(_fc_after_conv)
    {
        CLScheduler::get().enqueue(_im2col_kernel, false);
    }

    // Interleave input
    if(_batched_fc_layer)
    {
        CLScheduler::get().enqueue(_interleave4x4_kernel, false);
    }

    // Run matrix multiply
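    // The second argument controls whether the command queue is flushed: flush here only when no bias accumulation follows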
    CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);

    // Accumulate biases if provided
    if(_accumulate_biases)
    {
        CLScheduler::get().enqueue(_accumulate_biases_kernel);
    }
}