/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"

#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"

#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"

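// A minimal usage sketch (illustrative only; the tensor shapes, TensorInfo initialisation and allocation
// steps are assumptions, not part of this file):
//
//   Tensor src, weights, biases, dst;   // initialise each allocator with an F32 NCHW TensorInfo first
//   NEWinogradConvolutionLayer conv;
//   conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1), ActivationLayerInfo(), false);
//   // allocate the tensors, fill src/weights/biases, then:
//   conv.run();
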
namespace arm_compute
{
namespace
{
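// Assemble the 4D NHWC-ordered shape (batches, rows, cols, channels) expected by the Winograd transform kernels.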
inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
{
    const DataLayout data_layout = input->info()->data_layout();
    const int        in_width    = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
    const int        in_height   = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
    const int        in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
    const int        in_batches  = input->info()->dimension(3);

    return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
}

Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
    const DataLayout   data_layout = input->data_layout();
    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);

    ARM_COMPUTE_UNUSED(output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
    ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); // COMPMID-1162
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Only square kernels are supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5, "Only 3x3 and 5x5 kernels are supported");
    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");

    if(biases != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
    }

    return Status{};
}

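// Select the Winograd output tile: F(2x2, 3x3) for small inputs (width and height <= 4), F(4x4, 3x3) otherwise, and F(2x2, 5x5) for 5x5 kernels.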
Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims)
{
    Size2D output_tile = Size2D{};

    if(kernel_dims == Size2D(3U, 3U))
    {
        output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
    }
    else if(kernel_dims == Size2D(5U, 5U))
    {
        output_tile = Size2D(2U, 2U);
    }

    return output_tile;
}

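// The 5x5 configurations are flagged as requiring fast math because they may trade numerical accuracy for speed.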
bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
{
    // Check if we want to configure a Winograd configuration which requires fast math
    using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;

    std::vector<WinogradConfiguration> fast_math_winograd =
    {
        WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
    };

    auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
                            std::pair<int, int>(kernel_size.width, kernel_size.height));

    return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
}
} // namespace

NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr),
      _activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(),
      _workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false)
{
}

void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
                                           bool enable_fast_math)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info));

    // Get indices for the width and height
    const DataLayout   data_layout = input->info()->data_layout();
    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);

    const Size2D input_dims  = Size2D(input->info()->dimension(width_idx), input->info()->dimension(height_idx));
    const Size2D kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx));
    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);

    // Check if the Winograd configuration requires fast math
    if(!enable_fast_math)
    {
        ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
    }

    _weights = weights;
    _input   = input;
    _output  = output;

    std::unique_ptr<INEWinogradLayerTransformInputKernel<float>>   transform_input_kernel;
    std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
    std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>>  transform_output_kernel;

    int n_gemms = 0;
    int N_BLOCK = 0; // Size of block used by GEMM.
    switch(kernel_size.width)
    {
        case 3:
        {
            if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
            {
                transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>>();
                transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>>();
                transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>>();
                n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradBase::N_GEMMS;
                N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradConv::N_BLOCK;
            }
            else
            {
                transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
                transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
                transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
                n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradBase::N_GEMMS;
                N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradConv::N_BLOCK;
            }
            break;
        }
        case 5:
        {
            transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
            transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
            transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
            n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradBase::N_GEMMS;
            N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradConv::N_BLOCK;
            break;
        }
        default:
        {
            ARM_COMPUTE_ERROR("Not supported.");
            break;
        }
    }

    const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
    const bool        use_same_padding = use_padding_type == PADDING_SAME;

    // Get convolved dimensions
    const int in_channels  = input->info()->dimension(channel_idx);
    const int out_channels = output->info()->dimension(channel_idx);

    const Tensor4DShape in_shape(internal_get_input_shape(input));
    const size_t        data_type_size = input->info()->element_size();
    // Get the memory required to instantiate a new Winograd operator.
    constexpr size_t storage_alignment = 64;
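    // Each transform buffer is over-allocated by (storage_alignment - 1) bytes so that its contents can be aligned to a 64-byte boundary.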
    const size_t     kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
    _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
    _kernel_storage.allocator()->allocate();
    // Input storage
    const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
    _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
    _input_workspace.allocator()->allocate();

    // Output storage
    const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size;
    _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
    _output_workspace.allocator()->allocate();

    // Configure and allocate the destination tensor used to convert from the Winograd domain to the spatial domain when calling reshape_output()
    TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
                                _output->info()->dimension(1), _output->info()->dimension(3)),
                    1, _output->info()->data_type());
    _output_nhwc.allocator()->init(info);
    _output_nhwc.allocator()->allocate();

    // Re-order the weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
    _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
    _weights_hwio.allocator()->allocate();

    // Configure the kernel to transform the input tensor from NCHW -> NHWC
    _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
    _input_nhwc.allocator()->allocate();

    const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });

    // Configure the InputTransform
    const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
    transform_input_kernel->configure(reinterpret_cast<float *>(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
                                      reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_stride);

    // Configure the WeightsTransform
    const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
    transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast<float *>(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels);

    // Configure the OutputTransform
    // The biases tensor has not been allocated at this point in time; the output transform will add the biases to the final result in the run() method
    const int  output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
    const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));

    transform_output_kernel->configure(biases, reinterpret_cast<float *>(_output_workspace.buffer()),
                                       output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
                                       in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);

    // Configure GEMM
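    // Each of the n_gemms batched GEMMs computes an (m x k) * (k x n) product: m is the number of output tiles across the
    // whole batch, k is the number of input channels and n is the number of output channels.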
    const int    tile_rows                = iceildiv(output_shape.n_rows, output_tile.height);
    const int    tile_cols                = iceildiv(output_shape.n_cols, output_tile.width);
    const int    m                        = in_shape.n_batches * tile_rows * tile_cols;
    const int    k                        = in_shape.n_channels;
    const int    n                        = out_channels;
    const int    input_matrix_row_stride  = in_shape.n_channels;
    const int    kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
    const int    output_matrix_row_stride = kernel_matrix_row_stride;
    unsigned int num_threads              = NEScheduler::get().num_threads();

    _arm_gemm = arm_gemm::gemm<float, float>(NEScheduler::get().cpu_info(), m, n, k, 1, n_gemms, false, false, 1.f, 0.f, num_threads, false);
    _arm_gemm->set_arrays(reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_row_stride, 0, input_matrix_stride, reinterpret_cast<float *>(_kernel_storage.buffer()),
                          kernel_matrix_row_stride, kernel_matrix_stride, reinterpret_cast<float *>(_output_workspace.buffer()), output_matrix_row_stride, 0, output_matrix_stride);

    auto acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapper<arm_gemm::GemmCommon<float, float>>>();
    acl_gemm_wrapper->configure(_arm_gemm.get());
    const size_t workspace_size = _arm_gemm->get_working_size();

    // Allocate workspace
    if(workspace_size > 0)
    {
        const unsigned int alignment = 4096;
        allocate_workspace(workspace_size, _workspace, &_memory_group, alignment, 1);
        _arm_gemm->set_working_space(reinterpret_cast<float *>(_workspace.buffer()));
    }

    const unsigned int window_size = _arm_gemm->get_window_size();
    if(window_size < num_threads)
    {
        num_threads = window_size;
        _arm_gemm->set_nthreads(num_threads);
    }

    _gemm_kernel = std::move(acl_gemm_wrapper);

    // Reorder the convolved output to ACL's NCHW ordering
    _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));

    _transform_input_kernel   = std::move(transform_input_kernel);
    _transform_weights_kernel = std::move(transform_weights_kernel);
    _transform_output_kernel  = std::move(transform_output_kernel);

    // Configure Activation Layer
    _is_activationlayer_enabled = act_info.enabled();
    if(_is_activationlayer_enabled)
    {
        _activationlayer_function.configure(output, nullptr, act_info);
    }
}

void NEWinogradConvolutionLayer::run()
{
    _memory_group.acquire();
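    // Permute and transform the weights to the Winograd domain only once, on the first call to run()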
    if(!_reshaped_kernel)
    {
        _reshaped_kernel = true;
        _permute_weights.run();
        NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
    }
    // Bring channels to the front as the Winograd code expects the tensor to be in the format NHWC
    _permute_input.run();

    // Transform the input tensor to the Winograd domain
    NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);

    // Run the batched GEMMs in multiple threads; each kernel invocation runs one or more of the n_gemms GEMMs
    NEScheduler::get().schedule(_gemm_kernel.get(), Window::DimX);

    // Transform the output tensor to the spatial domain
    NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);

    // Reorder the convolved output to ACL's NCHW ordering
    _permute_output.run();

    if(_is_activationlayer_enabled)
    {
        _activationlayer_function.run();
    }
    _memory_group.release();
}

Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                            const ActivationLayerInfo &act_info, bool enable_fast_math)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));

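    // Validation mirrors configure(): validate the input transform, the filter transform and the output transform
    // (fed by the batched GEMM output) for the selected Winograd configuration.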
    // Get indices for the width and height
    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);

    // Input shape, kernel size and output tile
    const Size2D input_dims  = Size2D(input->dimension(idx_width), input->dimension(idx_height));
    const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height));
    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);

    // Check if the Winograd configuration requires fast math
    if(!enable_fast_math)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
    }

    const WinogradInfo winograd_info = WinogradInfo(output_tile,
                                                    kernel_size,
                                                    input_dims,
                                                    conv_info,
                                                    input->data_layout());

    // Validate input transform
    const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
    const TensorInfo  input0       = input->clone()->set_tensor_shape(input0_shape);
    switch(weights->dimension(idx_width))
    {
        case 3:
        {
            if(input_dims.width > 4 && input_dims.height > 4)
            {
                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, &input0, winograd_info)));
            }
            else
            {
                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, &input0, winograd_info)));
            }
            break;
        }
        case 5:
        {
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, &input0, winograd_info)));
            break;
        }
        default:
        {
            ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
            break;
        }
    }
    // Validate filter transform
    const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
    const TensorInfo  input1       = weights->clone()->set_tensor_shape(input1_shape);

    switch(weights->dimension(idx_width))
    {
        case 3:
        {
            if(input_dims.width > 4 && input_dims.height > 4)
            {
                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, &input1, winograd_info)));
            }
            else
            {
                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, &input1, winograd_info)));
            }
            break;
        }
        case 5:
        {
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, &input1, winograd_info)));
            break;
        }
        default:
        {
            ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
            break;
        }
    }
    // Validate batched matrix multiply
    TensorShape batched_mm_output_shape = input0.tensor_shape();
    batched_mm_output_shape[0]          = input1.tensor_shape()[0];
    const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);
    switch(weights->dimension(idx_width))
    {
        case 3:
        {
            if(input_dims.width > 4 && input_dims.height > 4)
            {
                // Validate output transform
                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
            }
            else
            {
                // Validate output transform
                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
            }
            break;
        }
        case 5:
        {
            // Validate output transform
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(&batched_mm_output, biases, output, winograd_info)));
            break;
        }
        default:
        {
            ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
            break;
        }
    }

    // Validate Activation Layer
    if(act_info.enabled())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
    }
    return Status{};
}

} // namespace arm_compute