arm_compute v19.11.1
diff --git a/src/core/CL/cl_kernels/elementwise_unary.cl b/src/core/CL/cl_kernels/elementwise_unary.cl
index b496fcf..e8a3fb7 100644
--- a/src/core/CL/cl_kernels/elementwise_unary.cl
+++ b/src/core/CL/cl_kernels/elementwise_unary.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,20 +48,28 @@
 /** Applies element wise unary operator in a tensor.
  *
  * @param[in]  in_ptr                            Pointer to the source image. Supported data types: F16/32.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         in_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
  * @param[out] out_ptr                           Pointer to the destination image. Supported data types: F16/32.
  * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
  */
 __kernel void elementwise_unary(
-    VECTOR_DECLARATION(in),
-    VECTOR_DECLARATION(out))
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out))
 {
-    Vector in  = CONVERT_TO_VECTOR_STRUCT(in);
-    Vector out = CONVERT_TO_VECTOR_STRUCT(out);
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
 
 #if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
     // Check if access on width gets out of bounds
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index 819e3c9..cd9552f 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,7 +35,7 @@
 using namespace arm_compute;
 
 CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel()
-    : _input(nullptr), _output(nullptr), _info()
+    : _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN)
 {
 }
 
@@ -72,13 +72,14 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    _input  = input;
-    _output = output;
-    _info   = info;
-
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info));
 
+    _input       = input;
+    _output      = output;
+    _info        = info;
+    _data_layout = input->info()->data_layout();
+
     // Create kernel
     CLBuildOptions build_opts;
     build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
@@ -99,10 +100,8 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const DataLayout data_layout = _input->info()->data_layout();
-
-    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
 
     const int out_start_x = _info.pad_left();
     const int out_end_x   = _output->info()->dimension(idx_w) - _info.pad_right() + _info.stride().first - 1;
@@ -112,7 +111,7 @@
     const int out_end_y   = _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
     const int out_step_y  = _info.stride().second;
 
-    switch(data_layout)
+    switch(_data_layout)
     {
         case DataLayout::NCHW:
         {
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 7b74a5a..e7ff762 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -400,7 +400,7 @@
 } // namespace
 
 CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
-    : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0)
+    : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _border_size(0), _conv_stride_x(0), _conv_stride_y(0)
 {
 }
 
@@ -413,10 +413,10 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
-    const DataLayout data_layout = input->info()->data_layout();
-    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    _data_layout          = input->info()->data_layout();
+    const int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
 
     const unsigned int kernel_size = weights->info()->dimension(width_idx);
     const DataType     data_type   = input->info()->data_type();
@@ -442,11 +442,11 @@
     _conv_stride_x = std::get<0>(conv_info.stride());
     _conv_stride_y = std::get<1>(conv_info.stride());
 
-    if(data_layout == DataLayout::NHWC)
+    if(_data_layout == DataLayout::NHWC)
     {
         _border_size = BorderSize(conv_info.pad_left(), 0, conv_info.pad_right(), 0);
     }
-    else if(data_layout == DataLayout::NCHW)
+    else if(_data_layout == DataLayout::NCHW)
     {
         _border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
     }
@@ -464,15 +464,15 @@
 
     std::stringstream kernel_name;
     kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
-    if(data_layout == DataLayout::NHWC)
+    if(_data_layout == DataLayout::NHWC)
     {
-        kernel_name << "_" << lower_string(string_from_data_layout(data_layout));
+        kernel_name << "_" << lower_string(string_from_data_layout(_data_layout));
     }
 
     CLBuildOptions build_options;
     build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
 
-    const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, data_layout);
+    const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
 
     if(run_optimized_for_bifrost)
     {
@@ -489,9 +489,9 @@
         build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
         build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
         build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
-        if(data_layout == DataLayout::NHWC)
+        if(_data_layout == DataLayout::NHWC)
         {
-            const bool run_optimized_for_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, data_layout);
+            const bool run_optimized_for_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
             build_options.add_option(std::string("-DDATA_LAYOUT_NHWC=1"));
             build_options.add_option(std::string("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(height_idx))));
             build_options.add_option(std::string("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(width_idx))));
@@ -561,7 +561,7 @@
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(height_idx));
     _config_id += "_";
-    _config_id += lower_string(string_from_data_layout(data_layout));
+    _config_id += lower_string(string_from_data_layout(_data_layout));
 }
 
 Status CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -585,9 +585,8 @@
     win_in.adjust(Window::DimX, -_border_size.left, true);
     win_in.adjust(Window::DimY, -_border_size.top, true);
 
-    const DataLayout data_layout = _input->info()->data_layout();
-    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int width_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
 
     win_in.set_dimension_step(width_idx, window[width_idx].step() * _conv_stride_x);
     win_in.set_dimension_step(height_idx, window[height_idx].step() * _conv_stride_y);
diff --git a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
index c4ab504..543c8f3 100644
--- a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
+++ b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -121,14 +121,15 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimX);
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
 
     do
     {
         unsigned int idx = 0;
-        add_1D_tensor_argument(idx, _input, collapsed);
-        add_1D_tensor_argument(idx, _output, collapsed);
-        enqueue(queue, *this, collapsed, lws_hint());
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice, lws_hint());
     }
-    while(window.slide_window_slice_1D(collapsed));
+    while(collapsed.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 10d6e68..24f22c3 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -287,7 +287,7 @@
 } // namespace
 
 CLIm2ColKernel::CLIm2ColKernel()
-    : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
+    : _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
 {
 }
 
@@ -297,9 +297,10 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
 
-    const DataLayout   data_layout  = input->info()->data_layout();
-    const unsigned int width_idx    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    _data_layout = input->info()->data_layout();
+
+    const unsigned int width_idx    = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
     const unsigned int input_width  = input->info()->dimension(width_idx);
     const unsigned int input_height = input->info()->dimension(height_idx);
 
@@ -336,7 +337,7 @@
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
     _config_id += "_";
-    _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
+    _config_id += lower_string(string_from_data_layout(_data_layout));
 }
 
 Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
@@ -369,7 +370,7 @@
     Window slice_in  = first_slice_3d;
     Window slice_out = window_output.first_slice_window_2D();
 
-    if(_input->info()->data_layout() == DataLayout::NHWC)
+    if(_data_layout == DataLayout::NHWC)
     {
         const Window tmp_win     = window.collapse_if_possible(ICLKernel::window(), 3);
         const int    num_batches = tmp_win[3].end();
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 8e69157..e3f1114 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -172,7 +172,7 @@
 } // namespace
 
 CLPoolingLayerKernel::CLPoolingLayerKernel()
-    : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1)
+    : _input(nullptr), _output(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1)
 {
 }
 
@@ -185,13 +185,18 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
+    // Set instance variables
+    _input       = input;
+    _output      = output;
+    _pool_info   = pool_info;
+    _data_layout = input->info()->data_layout();
+
     int                 pool_stride_x   = 0;
     int                 pool_stride_y   = 0;
     const PoolingType   pool_type       = pool_info.pool_type();
-    DataLayout          data_layout     = input->info()->data_layout();
-    const int           idx_width       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int           idx_height      = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int           idx_channel     = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const int           idx_width       = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int           idx_height      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const int           idx_channel     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
     const int           pool_size_x     = pool_info.is_global_pooling() ? input->info()->dimension(idx_width) : pool_info.pool_size().width;
     const int           pool_size_y     = pool_info.is_global_pooling() ? input->info()->dimension(idx_height) : pool_info.pool_size().height;
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
@@ -218,11 +223,6 @@
     auto_init(input->info(), output->info(), pool_info);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info));
 
-    // Set instance variables
-    _input     = input;
-    _output    = output;
-    _pool_info = pool_info;
-
     const DataType data_type = input->info()->data_type();
 
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -243,7 +243,7 @@
     build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
 
     // Create kernel
-    switch(data_layout)
+    switch(_data_layout)
     {
         case DataLayout::NCHW:
         {
@@ -292,7 +292,7 @@
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
     ICLKernel::configure_internal(std::get<1>(win_config));
 
-    if(data_layout == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         CLPoolingConfig pooling_config     = std::get<2>(win_config);
         _num_elems_processed_per_iteration = pooling_config.first;
@@ -308,7 +308,7 @@
     _config_id = "pooling_layer_";
     _config_id += lower_string(string_from_data_type(data_type));
     _config_id += "_";
-    _config_id += lower_string(string_from_data_layout(data_layout));
+    _config_id += lower_string(string_from_data_layout(_data_layout));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(idx_width));
     _config_id += "_";
@@ -339,7 +339,7 @@
     // Collapse window
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
 
-    switch(_input->info()->data_layout())
+    switch(_data_layout)
     {
         case DataLayout::NCHW:
         {
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index 488313f..82c5c8a 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -160,11 +160,12 @@
 
 void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy)
 {
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy));
+
     _input               = input;
     _output              = output;
     _interpolationPolicy = policy;
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy));
+    _data_layout         = input->info()->data_layout();
 
     float wr = 0.f;
     float hr = 0.f;
@@ -172,10 +173,9 @@
 
     const bool call_quantized_kernel = is_data_type_quantized_asymmetric(input->info()->data_type()) && policy == InterpolationPolicy::BILINEAR;
 
-    DataLayout data_layout = input->info()->data_layout();
-    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const bool is_nhwc     = data_layout == DataLayout::NHWC;
+    const int  idx_width  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const bool is_nhwc    = _data_layout == DataLayout::NHWC;
 
     // Compute the ratio between source width/height and destination width/height
     const unsigned int input_width   = input->info()->dimension(idx_width);
@@ -215,7 +215,7 @@
     std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
     std::string kernel_name = "scale_" + interpolation_name;
     kernel_name += call_quantized_kernel ? "_quantized_" : "_";
-    kernel_name += lower_string(string_from_data_layout(data_layout));
+    kernel_name += lower_string(string_from_data_layout(_data_layout));
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     unsigned int idx = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
@@ -249,7 +249,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    switch(_input->info()->data_layout())
+    switch(_data_layout)
     {
         case DataLayout::NCHW:
         {
diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
index 331b02d..9d2532e 100644
--- a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
@@ -37,7 +37,7 @@
 namespace arm_compute
 {
 CLUpsampleLayerKernel::CLUpsampleLayerKernel()
-    : _input(nullptr), _output(nullptr), _info(), _num_elems_processed_per_iteration_input_x()
+    : _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration_input_x()
 {
 }
 
@@ -71,13 +71,12 @@
     _input                                     = input;
     _output                                    = output;
     _info                                      = info;
+    _data_layout                               = input->info()->data_layout();
     _num_elems_processed_per_iteration_input_x = 1;
 
-    const DataLayout data_layout = input->info()->data_layout();
-
     TensorShape output_shape = misc::shape_calculator::compute_upsample_shape(*input->info(), info);
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
-    output->info()->set_data_layout(data_layout);
+    output->info()->set_data_layout(_data_layout);
 
     unsigned int num_elems_processed_per_iteration_x = 16;
     const int    output_width_x                      = output->info()->dimension(0);
@@ -88,7 +87,7 @@
 
     Window win{};
 
-    switch(data_layout)
+    switch(_data_layout)
     {
         case DataLayout::NCHW:
         {
@@ -140,8 +139,7 @@
     Window slice_out        = collapsed_window.first_slice_window_3D();
     Window slice_in         = collapsed_window.first_slice_window_3D();
 
-    DataLayout data_layout = _input->info()->data_layout();
-    switch(data_layout)
+    switch(_data_layout)
     {
         case DataLayout::NCHW:
             slice_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_input_x));
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index 1c31ceb..6125790 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -99,7 +99,7 @@
 } // namespace
 
 CLWinogradInputTransformKernel::CLWinogradInputTransformKernel()
-    : _border_size(0), _input(nullptr), _output(nullptr), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
+    : _border_size(0), _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
 {
 }
 
@@ -116,16 +116,17 @@
     const PadStrideInfo conv_info        = winograd_info.convolution_info;
     const Size2D        output_tile_size = winograd_info.output_tile_size;
     const Size2D        kernel_size      = winograd_info.kernel_size;
-    const DataLayout    data_layout      = input->info()->data_layout();
 
-    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    _data_layout = input->info()->data_layout();
+
+    const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
 
     // Compute number of elements to process in the X and Y direction
     const int num_elements_x = input->info()->dimension(idx_w) - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
     const int num_elements_y = input->info()->dimension(idx_h) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
 
-    if(data_layout == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         // Check if we need to extend the right or bottom border
         const unsigned int extra_border_right  = ((num_elements_x % output_tile_size.width) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.width - 1);
@@ -166,7 +167,7 @@
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
     build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
-    if(data_layout == DataLayout::NHWC)
+    if(_data_layout == DataLayout::NHWC)
     {
         build_opts.add_option_if(total_batches > 1, "-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y));
         build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
@@ -184,7 +185,7 @@
     const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
 
     // Check optimized kernel if output_dims == 2x2
-    if((tile_max_dim == 2) && (data_layout == DataLayout::NCHW))
+    if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW))
     {
         _step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2;
     }
@@ -192,7 +193,7 @@
     // Append stepz and data layout
     kernel_name += "_stepz";
     kernel_name += support::cpp11::to_string(_step_z);
-    kernel_name += "_" + lower_string(string_from_data_layout(data_layout));
+    kernel_name += "_" + lower_string(string_from_data_layout(_data_layout));
 
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
@@ -212,7 +213,7 @@
     _config_id += "_";
     _config_id += support::cpp11::to_string(conv_info.pad_top());
     _config_id += "_";
-    _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
+    _config_id += lower_string(string_from_data_layout(_data_layout));
 }
 
 Status CLWinogradInputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
@@ -229,11 +230,10 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    const DataLayout data_layout   = _input->info()->data_layout();
-    const size_t     idx_w         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const size_t     idx_h         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const size_t     idx_c         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-    const size_t     total_batches = window.shape().total_size_upper(3);
+    const size_t idx_w         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_c         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const size_t total_batches = window.shape().total_size_upper(3);
 
     // Collapse window
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
@@ -241,7 +241,7 @@
     Window slice = window_collapsed.first_slice_window_3D();
     slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1));
     slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1));
-    if(data_layout == DataLayout::NHWC)
+    if(_data_layout == DataLayout::NHWC)
     {
         slice.set(idx_h, Window::Dimension(0, _num_tiles_y * total_batches, 1));
     }
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index df631c3..98b0c10 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -63,7 +63,7 @@
 } // namespace
 
 NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel()
-    : _input(nullptr), _output(nullptr), _block_shape()
+    : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN)
 {
 }
 
@@ -80,6 +80,7 @@
     _input       = input;
     _output      = output;
     _block_shape = block_shape;
+    _data_layout = input->info()->data_layout();
 
     // Configure kernel window
     Window win = calculate_max_window(*input->info(), Steps());
@@ -99,7 +100,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
 
-    const int idx_channel  = get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+    const int idx_channel  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
     const int depth_size   = _input->info()->dimension(idx_channel);
     const int r            = (depth_size / (_block_shape * _block_shape));
     const int element_size = _input->info()->element_size();
@@ -112,7 +113,7 @@
     slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
     // Main loop for NCHW and NHWC
-    if(_input->info()->data_layout() == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         Window slice_in = window.first_slice_window_2D();
         do
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 0641d6c..27c3d66 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -265,10 +265,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const DataLayout   data_layout = _input->info()->data_layout();
-    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const unsigned int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
 
     const int input_w        = _input->info()->dimension(width_idx);
     const int input_h        = _input->info()->dimension(height_idx);
@@ -344,7 +343,7 @@
 }
 
 NEIm2ColKernel::NEIm2ColKernel()
-    : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U)
+    : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U), _data_layout(DataLayout::UNKNOWN)
 {
 }
 
@@ -355,9 +354,9 @@
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
     ARM_COMPUTE_UNUSED(num_groups);
 
-    const DataLayout   data_layout = input->info()->data_layout();
-    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    _data_layout                  = input->info()->data_layout();
+    const unsigned int width_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
 
     _input          = input;
     _output         = output;
@@ -370,7 +369,7 @@
                                         _conv_info, _dilation);
     _has_bias = has_bias;
 
-    if(data_layout == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         switch(_input->info()->data_type())
         {
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp
index 88a1c2e..2b63a15 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp
@@ -87,21 +87,21 @@
     const size_t start_plane = window.z().start();
     const size_t end_plane   = window.z().end();
 
-    const size_t start_plane_input = start_plane - (_padding.size() > 2 && start_plane >= _padding[2].first ? _padding[2].first : 0);
-
+    size_t start_plane_input = start_plane;
+    if(_padding.size() > 2)
+    {
+        start_plane_input = (start_plane < _padding[2].first) ? 0 : start_plane - _padding[2].first;
+    }
     const int output_plane_size = _output->info()->dimension(0) * _output->info()->dimension(1);
-    const int input_plane_size  = (_input->info()->dimension(0) + _input->info()->padding().right + _input->info()->padding().left) * (_input->info()->dimension(
-                                      1)
-                                  + _input->info()->padding().top + _input->info()->padding().bottom);
+    const int input_plane_size  = _input->info()->dimension(0) * _input->info()->dimension(1);
 
     const int pad_y_elems_top = (_padding.size() > 1 ? _padding[1].first : 0) * _output->info()->dimension(0);
     const int pad_y_elems_bot = (_padding.size() > 1 ? _padding[1].second : 0) * _output->info()->dimension(0);
 
-    const size_t jump_to_next_row_input   = _input->info()->dimension(0) + _input->info()->padding().right + _input->info()->padding().left;
-    const size_t jump_to_next_row_output  = _padding[0].first + _padding[0].second;
-    const size_t jump_to_next_plane_input = _input->info()->padding().empty() ? 0 : _input->info()->dimension(0) * (_input->info()->padding().right + _input->info()->padding().top);
+    const size_t jump_to_next_row_input  = _input->info()->dimension(0);
+    const size_t jump_to_next_row_output = _padding[0].first + _padding[0].second;
 
-    uint8_t       *output_row_ptr = _output->buffer() + start_plane * output_plane_size;
+    uint8_t       *output_row_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size;
     const uint8_t *input_it_ptr   = _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size;
     const auto     pad_value      = _constant_value.get<uint8_t>();
 
@@ -112,7 +112,7 @@
             memset(output_row_ptr, pad_value, output_plane_size);
             output_row_ptr += output_plane_size;
         }
-        else if(_padding.size() > 2 && z_i > _input->info()->dimension(2) + _padding[2].first - 1)
+        else if(_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1))
         {
             memset(output_row_ptr, pad_value, output_plane_size);
             output_row_ptr += output_plane_size;
@@ -168,7 +168,6 @@
                 memset(output_row_ptr, pad_value, _padding[0].second);
                 output_row_ptr += _padding[0].second;
             }
-            input_it_ptr += jump_to_next_plane_input;
             memset(output_row_ptr, pad_value, pad_y_elems_bot);
             output_row_ptr += pad_y_elems_bot;
         }
@@ -202,7 +201,9 @@
         switch(_input->info()->element_size())
         {
             case 1:
-                if(_input->info()->num_dimensions() == 3 && padding.size() <= 3)
+                if(_input->info()->num_dimensions() == 3 &&                           // Is 3D
+                   padding.size() <= 3 &&                                             // Has 3D padding
+                   !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding
                 {
                     _func = &NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad;
                 }
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index aaeb33f..59c3543 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -321,7 +321,7 @@
 } // namespace
 
 NEPoolingLayerKernel::NEPoolingLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
+    : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
 {
 }
 
@@ -364,14 +364,15 @@
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h));
 
     // Set instance variables
-    _input     = input;
-    _output    = output;
-    _pool_info = pool_info;
-    _is_square = (pool_size.x() == pool_size.y());
+    _input       = input;
+    _output      = output;
+    _pool_info   = pool_info;
+    _data_layout = input->info()->data_layout();
+    _is_square   = (pool_size.x() == pool_size.y());
 
     // Get data type
     const DataType data_type = input->info()->data_type();
-    const bool     is_nchw   = data_layout == DataLayout::NCHW;
+    const bool     is_nchw   = _data_layout == DataLayout::NCHW;
 
     if(data_type == DataType::QASYMM8)
     {
@@ -1578,7 +1579,12 @@
         // Calculate square-root in case of l2 pooling
         if(pooling_type == PoolingType::L2)
         {
-            vres = vmulq_f32(vres, vinvsqrtq_f32(vres));
+            float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
+                                   static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
+                                   static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
+                                   static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
+                                 };
+            vres = l2_res;
         }
 
         // Store result
@@ -1841,7 +1847,7 @@
     const bool         exclude_padding = _pool_info.exclude_padding();
 
     Window window_input(window);
-    if(_input->info()->data_layout() == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         // Set step for input in x and y direction for the input
         unsigned int window_x_inc = 0;
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index a2ce0de..5dc4b55 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -742,23 +742,8 @@
 
             for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
             {
-                T *in_ptr;
-                switch(axis)
-                {
-                    case 1:
-                        in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim)));
-                        break;
-                    case 2:
-                        in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim)));
-                        break;
-                    case 3:
-                        in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim)));
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Not supported");
-                }
+                const T   *in_ptr       = reinterpret_cast<T *>(input.ptr() + in_info.strides_in_bytes()[axis] * dim);
                 const auto vec_elements = wrapper::vloadq(in_ptr);
-
                 switch(op)
                 {
                     case ReductionOperation::SUM:
@@ -907,23 +892,8 @@
 
             for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
             {
-                uint8_t *in_ptr;
-                switch(axis)
-                {
-                    case 1:
-                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim));
-                        break;
-                    case 2:
-                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim));
-                        break;
-                    case 3:
-                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim));
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Not supported");
-                }
-                const auto vec_elements = wrapper::vloadq(in_ptr);
-
+                const uint8_t *in_ptr       = input.ptr() + in_info.strides_in_bytes()[axis] * index_dim;
+                const auto     vec_elements = wrapper::vloadq(in_ptr);
                 switch(op)
                 {
                     case ReductionOperation::SUM:
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 80da54f..1cea8c6 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -909,7 +909,7 @@
 void NEScaleKernel::scale_nhwc(const Window &window)
 {
     // Get data layout and width/height indices
-    const DataLayout data_layout  = _input->info()->data_layout();
+    const DataLayout data_layout  = DataLayout::NHWC;
     const int        idx_channels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
     const int        idx_width    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int        idx_height   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
index 4803365..ffd2dc1 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
@@ -66,7 +66,7 @@
 } // namespace
 
 NESpaceToDepthLayerKernel::NESpaceToDepthLayerKernel()
-    : _input(nullptr), _output(nullptr), _block_shape()
+    : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN)
 {
 }
 
@@ -82,6 +82,7 @@
     _input       = input;
     _block_shape = block_shape;
     _output      = output;
+    _data_layout = input->info()->data_layout();
 
     // Configure kernel window
     Window win = calculate_max_window(*output->info(), Steps());
@@ -100,9 +101,8 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
 
-    const DataLayout data_layout  = _input->info()->data_layout();
-    const int        channel_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-    const int        element_size = _input->info()->element_size();
+    const int channel_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const int element_size = _input->info()->element_size();
 
     const size_t channel_size = _input->info()->dimension(channel_idx);
 
@@ -111,7 +111,7 @@
     int batch_id = 0;
 
     // Main loop for NCHW and NHWC
-    if(_output->info()->data_layout() == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         do
         {
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index a3634cd..c5de43d 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -26,20 +26,81 @@
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
 namespace arm_compute
 {
+namespace
+{
+Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+{
+    ARM_COMPUTE_UNUSED(keep_dims);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+    const unsigned int reduction_ops = reduction_axis.num_dimensions();
+    const int          input_dims    = input->num_dimensions();
+    Coordinates        axis_local    = reduction_axis;
+
+    for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
+    {
+        // axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
+        ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
+        ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
+    }
+
+    if(output->tensor_shape().total_size() != 0)
+    {
+        // Only validate if not using auto_init for the output tensor
+        TensorShape out_shape = input->tensor_shape();
+        // Validate output_shape only if not using auto_init
+        convert_negative_axis(axis_local, input_dims);
+        std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+        for(unsigned int i = 0; i < reduction_ops; ++i)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+            ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
+            if(output->total_size() > 0 && keep_dims)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+            }
+            if(keep_dims)
+            {
+                out_shape.set(axis_local[i], 1);
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i]));
+                const unsigned int remove_index = axis_local[i] - i;
+                ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions());
+                out_shape.remove_dimension(remove_index);
+            }
+        }
+        const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+    }
+    return Status{};
+}
+}
 CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
 {
 }
 void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
+    // Output auto initialization if not yet initialized
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     _reduction_ops = reduction_axis.num_dimensions();
     _reduction_kernels.resize(_reduction_ops);
@@ -49,14 +110,10 @@
     Coordinates axis_local = reduction_axis;
     const int   input_dims = input->info()->num_dimensions();
 
-    // Convert negative axis
-    for(unsigned int i = 0; i < _reduction_ops; ++i)
-    {
-        axis_local[i] = wrap_around(axis_local[i], input_dims);
-    }
+    convert_negative_axis(axis_local, input_dims);
 
     // Perform reduction for every axis
-    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    for(int i = 0; i < _reduction_ops; ++i)
     {
         TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
         out_shape.set(axis_local[i], 1);
@@ -75,7 +132,7 @@
     }
 
     // Allocate intermediate tensors
-    for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+    for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
     {
         _reduced_outs[i].allocator()->allocate();
     }
@@ -88,7 +145,7 @@
         // We have to sort the reduction axis vectors in order for remove_dimension
         // to work properly
         std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
-        for(unsigned int i = 0; i < _reduction_ops; ++i)
+        for(int i = 0; i < _reduction_ops; ++i)
         {
             out_shape.remove_dimension(axis_local[i] - i);
         }
@@ -99,55 +156,16 @@
 
 Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
-
-    TensorShape out_shape = input->tensor_shape();
-
-    Coordinates        axis_sorted   = reduction_axis;
-    const unsigned int reduction_ops = reduction_axis.num_dimensions();
-    const int          input_dims    = input->num_dimensions();
-
-    // Convert negative axis
-    for(unsigned int i = 0; i < reduction_ops; ++i)
-    {
-        axis_sorted[i] = wrap_around(axis_sorted[i], input_dims);
-    }
-
-    std::sort(axis_sorted.begin(), axis_sorted.begin() + reduction_ops);
-    for(unsigned int i = 0; i < reduction_ops; ++i)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(axis_sorted[i] > 3);
-        ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_sorted[i]) > input->num_dimensions() - 1);
-        if(output->total_size() > 0 && keep_dims)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_sorted[i]) != 1);
-        }
-        if(keep_dims)
-        {
-            out_shape.set(axis_sorted[i], 1);
-        }
-        else
-        {
-            out_shape.remove_dimension(axis_sorted[i] - i);
-        }
-    }
-
-    const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-
-    return Status{};
+    return validate_config(input, reduction_axis, keep_dims, output);
 }
 
 void CLReduceMean::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    for(auto &kernel : _reduction_kernels)
     {
-        _reduction_kernels[i].run();
+        kernel.run();
     }
 
     if(!_keep_dims)
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 10437f5..72c63a8 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -26,40 +26,13 @@
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 namespace arm_compute
 {
 namespace
 {
-inline TensorShape calculate_reduce_mean_shape(ITensor *input, const Coordinates &reduction_axis, bool keep_dims)
-{
-    const int   reduction_ops = reduction_axis.num_dimensions();
-    Coordinates axis_local    = reduction_axis;
-    const int   input_dims    = input->info()->num_dimensions();
-    convert_negative_axis(axis_local, input_dims);
-    TensorShape out_shape = input->info()->tensor_shape();
-    // Configure reshape layer if we want to drop the dimensions
-    if(!keep_dims)
-    {
-        // We have to sort the reduction axis vectors in order for remove_dimension
-        // to work properly
-        std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
-        for(int i = 0; i < reduction_ops; ++i)
-        {
-            out_shape.remove_dimension(axis_local[i] - i);
-        }
-        return out_shape;
-    }
-    else
-    {
-        for(int i = 0; i < reduction_ops; ++i)
-        {
-            out_shape.set(axis_local[i], 1);
-        }
-        return out_shape;
-    }
-}
 } // namespace
 
 NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
@@ -130,7 +103,7 @@
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
     // Output auto inizialitation if not yet initialized
-    const TensorShape output_shape = calculate_reduce_mean_shape(input, reduction_axis, keep_dims);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims);
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     _reduction_ops = reduction_axis.num_dimensions();