arm_compute v18.08
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 49753ad..a70389a 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -33,17 +33,34 @@
namespace
{
-Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims)
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataLayout data_layout)
{
Size2D output_tile = Size2D{};
- if(kernel_dims == Size2D(3U, 3U))
+ const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height);
+
+ // Check if the input spatial dimensions are smaller than 4
+ const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW);
+
+ if(kernel_max_dim == 3U)
{
- output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+ if(kernel_dims == Size2D(3U, 3U))
+ {
+ output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U);
+ }
+ else if(kernel_dims == Size2D(3U, 1U))
+ {
+ output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U);
+ }
+ else
+ {
+ output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U);
+ }
}
- else if(kernel_dims == Size2D(5U, 5U))
+ else if(kernel_max_dim == 5U)
{
- output_tile = Size2D(4U, 4U);
+ output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U,
+ kernel_dims.height == 1 ? 1U : 4U);
}
return output_tile;
@@ -82,7 +99,7 @@
// Input shape, kernel size and output tile
const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
- const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, input->info()->data_layout());
// Check if the Winograd configuration requires fast math
if(!enable_fast_math)
@@ -139,7 +156,7 @@
// Input shape, kernel size and output tile
const Size2D input_dims = Size2D(input->tensor_shape()[idx_width], input->tensor_shape()[idx_height]);
const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
- const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, input->data_layout());
// Check if the Winograd configuration requires fast math
if(!enable_fast_math)