arm_compute v18.05
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000..b5b159a
--- /dev/null
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
+
+using namespace arm_compute;
+
+NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights()
+    : _kernel()
+{
+}
+
+void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape,
+                                               DataLayout data_layout)
+{
+    _kernel.configure(input, output, original_input_shape, data_layout);
+}
+
+Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
+                                                DataLayout data_layout)
+{
+    return NEConvertFullyConnectedWeightsKernel::validate(input, output, original_input_shape, data_layout);
+}
+
+void NEConvertFullyConnectedWeights::run()
+{
+    NEScheduler::get().schedule(&_kernel, Window::DimZ);
+}
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 0a49158..7053c7e 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -30,41 +30,44 @@
 
 #include <cmath>
 #include <tuple>
+#include <utility>
 
 namespace arm_compute
 {
-NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_manager(std::move(memory_manager)), _function()
+NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) //NOLINT
+    : _memory_manager(std::move(memory_manager)),
+      _function()
 {
 }
 
-void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info));
+    ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
+                                                            enable_fast_math));
 
-    switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
-                                                      weights_info))
+    switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info))
     {
         case ConvolutionMethod::WINOGRAD:
         {
-            auto f = arm_compute::support::cpp14::make_unique<NEWinogradLayer>(_memory_manager);
-            f->configure(input, weights, biases, output, conv_info);
+            auto f = arm_compute::support::cpp14::make_unique<NEWinogradConvolutionLayer>(_memory_manager);
+            f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
             _function = std::move(f);
             break;
         }
         case ConvolutionMethod::GEMM:
         {
             auto f = arm_compute::support::cpp14::make_unique<NEGEMMConvolutionLayer>(_memory_manager);
-            f->configure(input, weights, biases, output, conv_info, weights_info);
+            f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
             _function = std::move(f);
             break;
         }
         case ConvolutionMethod::DIRECT:
         {
             auto f = arm_compute::support::cpp14::make_unique<NEDirectConvolutionLayer>(_memory_manager);
-            f->configure(input, weights, biases, output, conv_info);
+            f->configure(input, weights, biases, output, conv_info, act_info);
             _function = std::move(f);
             break;
         }
@@ -75,21 +78,21 @@
 }
 
 Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                    const WeightsInfo &weights_info)
+                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
 {
-    switch(NEConvolutionLayer::get_convolution_method(input, weights, biases, output, conv_info, weights_info))
+    switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info))
     {
         case ConvolutionMethod::WINOGRAD:
             //Validate Winograd
-            NEWinogradLayer::validate(input, weights, biases, output, conv_info);
+            ARM_COMPUTE_RETURN_ON_ERROR(NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
             break;
         case ConvolutionMethod::GEMM:
             //Validate Gemm-based Convolution
-            NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info);
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info));
             break;
         case ConvolutionMethod::DIRECT:
             //Validate Gemm-based Convolution
-            NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info);
+            ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
         default:
             ARM_COMPUTE_ERROR("Not supported.");
             break;
@@ -98,17 +101,20 @@
     return Status{};
 }
 
-ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                             const WeightsInfo &weights_info)
+ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
+                                                             const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                             const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
 {
-    ARM_COMPUTE_UNUSED(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
     ARM_COMPUTE_UNUSED(weights_info);
-    if((input->data_type() == DataType::F32) && (weights->dimension(0) == 3) && (weights->dimension(1) == 3) && (weights->num_dimensions() <= 4) && (conv_info.stride().first == 1)
-       && (conv_info.stride().second == 1) && (biases != nullptr))
+
+    if(dilation != Size2D(1U, 1U) || Scheduler::get().cpu_info().get_cpu_model() == CPUModel::A53
+       || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
     {
-        return ConvolutionMethod::WINOGRAD;
+        return ConvolutionMethod::GEMM;
     }
-    return ConvolutionMethod::GEMM;
+
+    return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
 }
 
 void NEConvolutionLayer::run()
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index c1ba5dd..40ada8f 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -34,6 +34,7 @@
 NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
     : _memory_group(std::move(memory_manager)),
       _conv_f(),
+      _upsample_f(),
       _scaled_output(),
       _input(nullptr),
       _info(),
@@ -41,13 +42,64 @@
 {
 }
 
+Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info,
+                                      unsigned int inner_border_right, unsigned int inner_border_top)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
+
+    const unsigned int stride_x = info.stride().first;
+    const unsigned int stride_y = info.stride().second;
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_right > stride_x - 1, "inner_border_right must be smaller than stride_x");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y");
+
+    auto out_dims = deconvolution_output_dimensions(input->dimension(0), input->dimension(1), weights->dimension(0), weights->dimension(1),
+                                                    info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, bias);
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
+    }
+
+    if(output->tensor_shape().total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+        const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape());
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+    }
+
+    TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_deconvolution_shape(*input, stride_x, stride_y, inner_border_right, inner_border_top,
+                                                                                                      info)));
+    const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != scale_out_info.dimension(i));
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo()));
+
+    return Status{};
+}
+
 void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info,
                                      unsigned int inner_border_right, unsigned int inner_border_top)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON(!info.padding_is_symmetric());
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
     _input        = input;
     _info         = info;
@@ -55,15 +107,9 @@
 
     const unsigned int stride_x = info.stride().first;
     const unsigned int stride_y = info.stride().second;
-    auto               out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
-                                                                  info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
 
-    const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
-
-    ARM_COMPUTE_UNUSED(output_shape);
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
 
     _memory_group.manage(&_scaled_output);
 
@@ -79,44 +125,20 @@
 
     // Allocate auxiliary tensors
     _scaled_output.allocator()->allocate();
+
+    // configure upsample function
+    _upsample_f.configure(input, &_scaled_output, info, inner_border_right, inner_border_top);
 }
 
 void NEDeconvolutionLayer::run()
 {
     _memory_group.acquire();
 
-    // Initialize _scaled_output buffer
-    const int width_in      = _input->info()->dimension(0);
-    const int height_in     = _input->info()->dimension(1);
-    const int width_scaled  = _scaled_output.info()->dimension(0);
-    const int height_scaled = _scaled_output.info()->dimension(1);
-    const int num_2d_slices = _input->info()->tensor_shape().total_size() / (width_in * height_in);
-    const int stride_x      = _info.stride().first;
-    const int stride_y      = _info.stride().second;
-
-    std::fill_n(_scaled_output.buffer(), _scaled_output.info()->total_size(), 0);
-
-    // scaled_output is the input for the forward convolution. We copy the input elements to scaled_output
-    // and insert rows and columns with zeroes depending on the stride values.
-    for(int slice = 0; slice < num_2d_slices; ++slice)
-    {
-        const int start_x = _info.pad().first;
-        const int start_y = _inner_border.second + _info.pad().second;
-        const int end_y   = height_scaled - _info.pad().second;
-        const int end_x   = width_scaled - _inner_border.first - _info.pad().first;
-
-        for(int yi = start_y, in_y = 0; yi < end_y; yi += stride_y, in_y++)
-        {
-            for(int xi = start_x, in_x = 0; xi < end_x; xi += stride_x, in_x++)
-            {
-                const auto in = *(reinterpret_cast<float *>(_input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(in_x, in_y, slice))));
-                *(reinterpret_cast<float *>(_scaled_output.buffer() + _scaled_output.info()->offset_element_in_bytes(Coordinates(xi, yi, slice)))) = in;
-            }
-        }
-    }
+    // Run upsample kernel
+    _upsample_f.run();
 
     // Run convolution layer
     _conv_f.run();
 
     _memory_group.release();
-}
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 95fcf88..0a977ad 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -37,11 +37,11 @@
 
 NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
     : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _input_nhwc(), _weights_hwio(), _output_nhwc(), _has_bias(false),
-      _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false)
+      _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true)
 {
 }
 
-void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -52,30 +52,39 @@
     _has_bias     = biases != nullptr;
     _is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
                                                                                           conv_info,
-                                                                                          input->info()->data_type());
+                                                                                          input->info()->data_type(),
+                                                                                          depth_multiplier,
+                                                                                          input->info()->data_layout());
     _are_weights_reshaped = false;
+    _is_nchw              = input->info()->data_layout() == DataLayout::NCHW;
+
+    ARM_COMPUTE_ERROR_ON(!_is_optimized && !_is_nchw);
 
     if(_is_optimized)
     {
-        // Configure the function to transform the input tensor from NCHW -> NHWC
-        _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+        if(_is_nchw)
+        {
+            // Configure the function to transform the input tensor from NCHW -> NHWC
+            _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
 
-        // Configure the function to transform the weights tensor from IHW -> HWI
-        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+            // Configure the function to transform the weights tensor from IHW -> HWI
+            _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
 
-        // Configure optimized depthwise
-        _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, DataLayout::NHWC);
+            // Configure optimized depthwise
+            _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, depth_multiplier, DataLayout::NHWC);
 
-        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-        _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
+            // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+            _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
 
-        // Allocate tensors
-        _input_nhwc.allocator()->allocate();
-        _weights_hwio.allocator()->allocate();
-        _output_nhwc.allocator()->allocate();
-
-        // Create convolver (deferred)
-        _dwc_kernel.generate_convolver();
+            // Allocate tensors
+            _input_nhwc.allocator()->allocate();
+            _weights_hwio.allocator()->allocate();
+            _output_nhwc.allocator()->allocate();
+        }
+        else
+        {
+            _dwc_kernel.configure(input, weights, output, conv_info, depth_multiplier, DataLayout::NHWC);
+        }
     }
     else
     {
@@ -88,7 +97,7 @@
         }
 
         // Configure depthwise convolution kernel
-        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info);
+        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
 
         // Configure border handler
         _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
@@ -116,8 +125,15 @@
 
 void NEDepthwiseConvolutionLayer3x3::run()
 {
+    if(_is_first_run && _is_optimized)
+    {
+        _is_first_run = false;
+        // Create convolver (deferred)
+        _dwc_kernel.generate_convolver();
+    }
+
     // Permute weights in HWIO format if the optimized kernel will be executedd
-    if(!_are_weights_reshaped && _is_optimized)
+    if(!_are_weights_reshaped && _is_optimized && _is_nchw)
     {
         _are_weights_reshaped = true;
         _permute_weights.run();
@@ -126,8 +142,11 @@
     // Handle input
     if(_is_optimized)
     {
-        // Permute input to NHWC format execution
-        _permute_input.run();
+        if(_is_nchw)
+        {
+            // Permute input to NHWC format execution
+            _permute_input.run();
+        }
     }
     else
     {
@@ -139,7 +158,7 @@
     NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
 
     // Permute output to ACL's native NCHW format in case of NHWC execution
-    if(_is_optimized)
+    if(_is_optimized && _is_nchw)
     {
         _permute_output.run();
     }
@@ -153,31 +172,37 @@
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
-      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
+      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
-void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != weights->info()->dimension(2));
 
     const size_t weights_w = weights->info()->dimension(0);
     const size_t weights_h = weights->info()->dimension(1);
     const size_t weights_z = weights->info()->dimension(2);
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_first_run     = true;
+    _original_weights = weights;
 
     // Should bias be appended ?
     bool append_bias = (biases != nullptr) && !_is_quantized;
 
     // Calculate output shape
-    TensorShape dwc_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
+    TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
 
     // Output width and height
-    const unsigned int conv_w = dwc_output_shape.x();
-    const unsigned int conv_h = dwc_output_shape.y();
+    const unsigned int conv_w = output_shape.x();
+    const unsigned int conv_h = output_shape.y();
 
     // Set up intermediate tensors
     const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
@@ -189,7 +214,7 @@
     shape_im2col.set(1, conv_size);
     shape_im2col.set(2, weights_z);
     _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias);
+    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -204,7 +229,7 @@
     shape_v2mm_out.set(2, 1);
     _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
     _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
-    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dwc_output_shape));
+    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
     _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
 
     // Output staged configuration
@@ -241,10 +266,21 @@
 
 void NEDepthwiseConvolutionLayer::run()
 {
+    // Run weights reshaping (Runs once for every configure)
+    if(_is_first_run)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+        NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
+        _is_first_run = false;
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
+    }
+
     NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
-    NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
     NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
-    NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
     NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
     NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
     if(_is_quantized)
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index a58b6e4..0627977 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 using namespace arm_compute;
@@ -34,8 +35,18 @@
 {
 }
 
+Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(input, output, min_max));
+
+    return Status{};
+}
+
 void NEDequantizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+
     // Configure kernel
     _dequantize_kernel.configure(input, output, min_max);
 }
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index c26c99a..445864c 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,18 +34,23 @@
 using namespace arm_compute;
 
 NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _accumulator(), _has_bias(false), _is_fixed_point(false)
+    : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), _is_fixed_point(false),
+      _is_activationlayer_enabled(false), _dim_split(Window::DimZ)
 {
 }
 
-void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info)
+void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
 {
+    ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
+
     // Free accumulator
     if(_accumulator.buffer() != nullptr)
     {
         _accumulator.allocator()->free();
     }
 
+    _dim_split = input->info()->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY;
+
     // Check if bias should be added in the convolution result
     _has_bias = (bias != nullptr);
 
@@ -73,9 +78,17 @@
 
     // Add zero padding XY
     _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+
+    //Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
 }
 
-Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info)
+Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                          const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
 
@@ -101,6 +114,11 @@
     // Validate bias kernel
     ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, bias, output));
 
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+    }
+
     return Status{};
 }
 
@@ -110,10 +128,15 @@
 
     _memory_group.acquire();
 
-    NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
+    NEScheduler::get().schedule(&_conv_kernel, _dim_split);
     if(_has_bias || _is_fixed_point)
     {
         NEScheduler::get().schedule(&_output_stage_kernel, Window::DimY);
     }
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
     _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 26b7271..958d081 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -132,7 +132,7 @@
 
 NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(),
-      _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false)
+      _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _original_weights(nullptr)
 {
 }
 
@@ -163,6 +163,7 @@
     const int    num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
     const size_t linear_input_size    = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
 
+    _original_weights     = weights;
     _linearize_input      = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
     _are_weights_reshaped = are_weights_reshaped;
     _accumulate_biases    = biases != nullptr;
@@ -187,7 +188,7 @@
 
     if(_linearize_input)
     {
-        _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(input->info(), num_input_dimensions)));
+        _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input->info(), num_input_dimensions)));
 
         // Configure im2col kernel
         _memory_group.manage(&_im2col_output);
@@ -287,7 +288,7 @@
 
     if(linearize_input)
     {
-        im2col_output->set_tensor_shape(compute_im2col_shape(input, num_input_dimensions));
+        im2col_output->set_tensor_shape(compute_im2col_fc_shape(input, num_input_dimensions));
 
         ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, im2col_output.get(), Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true));
 
@@ -324,8 +325,13 @@
     // Reshape of the weights (happens only once)
     if(!_are_weights_reshaped)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _are_weights_reshaped = true;
         _reshape_weights_kernel.run();
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 05907ba..9168ed4 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -26,37 +26,20 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "support/ToolchainSupport.h"
 
-namespace arm_compute
-{
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch-default"
-#pragma GCC diagnostic ignored "-Weffc++"
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp"
-#pragma GCC diagnostic pop
-} // namespace arm_compute
-
 #include <cmath>
 
 namespace arm_compute
 {
 NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(), _B_pretransposed(),
       _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
 {
 }
@@ -83,41 +66,14 @@
     _reshape_b_only_on_first_run      = gemm_info.reshape_b_only_on_first_run();
     _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
 
+    const bool run_optimised = a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f)
+                               && setup_assembly_kernel(a, b, d, alpha, beta, _reshape_b_only_on_first_run, _workspace, _B_pretransposed, _memory_group, _asm_glue);
+
     // Check if the first input tensor is a vector.
     // If so, all the kernels for reshaping the tensors can be skipped
     if(_run_vector_matrix_multiplication)
     {
-#if defined(__aarch64__)
-        if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
-        {
-            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMVAArch64Kernel>();
-        }
-
-        if(_mm_optimised_kernel != nullptr)
-        {
-            struct CPUInfo ci = NEScheduler::get().cpu_info();
-
-            const int N = d->info()->tensor_shape().x();
-            const int K = a->info()->tensor_shape().x();
-
-            size_t workbench_size = 0;
-
-            if(a->info()->data_type() == DataType::F32)
-            {
-                workbench_size = GemvTransposed<sgemv_trans, sgemv_trans::operand_type, sgemv_trans::result_type>(&ci, N, K).get_working_size();
-            }
-
-            constexpr size_t alignment = 4096;
-            ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
-            _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
-            _memory_group.manage(&_workspace);
-
-            // Configure matrix multiplication kernel
-            _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
-            _workspace.allocator()->allocate();
-        }
-        else
-#endif /* defined(__aarch64__) */
+        if(!run_optimised)
         {
             // Configure the matrix multiply kernel
             _mm_kernel.configure(a, b, d, alpha, false);
@@ -132,65 +88,7 @@
     }
     else
     {
-#if defined(__arm__)
-        if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
-        {
-            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
-        }
-#elif defined(__aarch64__)
-        if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
-        {
-            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
-        }
-        else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f))
-        {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            _mm_optimised_kernel = support::cpp14::make_unique<NEHGEMMAArch64FP16Kernel>();
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-            ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        }
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
-#if defined(__arm__) || defined(__aarch64__)
-        if(_mm_optimised_kernel != nullptr)
-        {
-            struct CPUInfo ci = NEScheduler::get().cpu_info();
-
-            const int M = d->info()->tensor_shape().y();
-            const int N = d->info()->tensor_shape().x();
-            const int K = a->info()->tensor_shape().x();
-
-            size_t workbench_size = 0;
-
-#if defined(__arm__)
-            workbench_size = GemmInterleaved<sgemm_8x6, sgemm_8x6::operand_type, sgemm_8x6::result_type>(&ci, M, N, K, false, false).get_working_size();
-#elif defined(__aarch64__)
-            if(a->info()->data_type() == DataType::F32)
-            {
-                workbench_size = GemmInterleaved<sgemm_12x8, sgemm_12x8::operand_type, sgemm_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
-            }
-            else if(a->info()->data_type() == DataType::F16)
-            {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                workbench_size = GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type>(&ci, M, N, K, false, false).get_working_size();
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-                ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-            }
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
-            constexpr size_t alignment = 4096;
-            ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
-            _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
-            _memory_group.manage(&_workspace);
-
-            // Configure matrix multiplication kernel
-            _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
-            _workspace.allocator()->allocate();
-        }
-        else
-#endif /* defined(__arm__) || defined(__aarch64__) */
+        if(!run_optimised)
         {
             TensorShape shape_tmp_a = a->info()->tensor_shape();
             TensorShape shape_tmp_b = b->info()->tensor_shape();
@@ -210,7 +108,10 @@
 
             // Manage intermediate buffers
             _memory_group.manage(&_tmp_a);
-            _memory_group.manage(&_tmp_b);
+            if(!_reshape_b_only_on_first_run)
+            {
+                _memory_group.manage(&_tmp_b);
+            }
 
             int m = a->info()->dimension(1);
             int n = b->info()->dimension(0);
@@ -243,9 +144,9 @@
 {
     _memory_group.acquire();
 
-    if(_mm_optimised_kernel != nullptr)
+    if(_asm_glue._optimised_kernel != nullptr)
     {
-        NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+        _asm_glue.run();
         _memory_group.release();
     }
     else
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index a85078c..2888b43 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -23,9 +23,6 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
 
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
@@ -34,13 +31,6 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
-} // namespace arm_compute
-
 #include <cmath>
 #include <tuple>
 
@@ -175,19 +165,28 @@
     }
 }
 
-Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, DataType &dt,
-                                      bool &append_bias,
+Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+                                      const ActivationLayerInfo &act_info, DataType &dt,
+                                      bool &append_bias, bool &skip_im2col,
                                       bool &are_weights_reshaped, unsigned int &kernel_width, unsigned int &kernel_height,
-                                      bool &is_fully_connected_convolution, bool &is_interleaved, bool &is_quantized,
+                                      bool &is_fully_connected_convolution, bool &is_interleaved, bool &is_quantized, bool &is_activationlayer_enabled,
                                       unsigned int &mat_weights_cols, unsigned int &mat_weights_rows,
-                                      unsigned int &conv_w, unsigned int &conv_h)
+                                      unsigned int &conv_w, unsigned int &conv_h, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && weights->dimension(2) != input->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+    DataLayout data_layout = input->data_layout();
+    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int  idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && weights->dimension(idx_channel) != input->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
     ARM_COMPUTE_RETURN_ERROR_ON(weights_info.are_reshaped() && is_data_type_quantized_asymmetric(input->data_type()));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(data_layout == DataLayout::NHWC && input->data_type() != DataType::F32, "NHWC is only supported for FP32 data type.");
 
     dt           = input->data_type();
     is_quantized = is_data_type_quantized_asymmetric(dt);
@@ -207,28 +206,32 @@
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     }
 
+    // If we have 1x1 convolution and data layout is NHWC we can disable im2col
     append_bias          = (biases != nullptr) && (!is_quantized);
     are_weights_reshaped = weights_info.are_reshaped();
-    kernel_width         = (are_weights_reshaped) ? weights_info.kernel_size().first : weights->dimension(0);
-    kernel_height        = (are_weights_reshaped) ? weights_info.kernel_size().second : weights->dimension(1);
+    kernel_width         = (are_weights_reshaped) ? weights_info.kernel_size().first : weights->dimension(idx_width);
+    kernel_height        = (are_weights_reshaped) ? weights_info.kernel_size().second : weights->dimension(idx_height);
     mat_weights_cols     = weights->dimension(3);
-    mat_weights_rows     = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (append_bias ? 1 : 0);
+    mat_weights_rows     = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel) + ((append_bias && !skip_im2col) ? 1 : 0);
+    skip_im2col          = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1);
 
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
-                                                 conv_info);
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width), input->dimension(idx_height), kernel_width, kernel_height,
+                                                 conv_info, dilation);
 
     // Check if its a "fully connected" convolution
     is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
     is_interleaved                 = (!is_fully_connected_convolution && !is_quantized);
+    is_activationlayer_enabled     = act_info.enabled();
 
     return Status{};
 }
 } // namespace
 
 NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
-    : _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _mm_gemmlowp(memory_manager),
-      _gemmlowp_output_stage(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _tmp_output(), _workspace(), _append_bias(false),
-      _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false), _is_interleaved(false)
+    : _asm_glue(), _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
+      _output_col2im_kernel(), _activationlayer_function(), _add_bias_kernel(), _original_weights(nullptr), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(),
+      _tmp_output(), _workspace(), _B_pretransposed(), _data_layout(DataLayout::NCHW), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false),
+      _is_interleaved(false), _is_activationlayer_enabled(false), _skip_im2col(false)
 {
 }
 
@@ -256,26 +259,8 @@
     }
 }
 
-void NEGEMMConvolutionLayer::configure_asm_mm(const struct CPUInfo &ci, int M, int N, int K)
-{
-    ARM_COMPUTE_UNUSED(ci);
-    ARM_COMPUTE_UNUSED(M);
-    ARM_COMPUTE_UNUSED(N);
-    ARM_COMPUTE_UNUSED(K);
-#if defined(__arm__) || defined(__aarch64__)
-#if defined(__arm__)
-    GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
-#elif defined(__aarch64__)
-    GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
-    constexpr size_t alignment = 4096;
-    _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-    _memory_group.manage(&_workspace);
-#endif /* defined(__arm__) || defined(__aarch64__) */
-}
-
-void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+                                       const Size2D &dilation, const ActivationLayerInfo &act_info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -288,45 +273,35 @@
     unsigned int conv_w           = 0;
     unsigned int conv_h           = 0;
 
-    Status status = validate_and_initialize_values(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), conv_info, weights_info, dt, _append_bias, _are_weights_reshaped,
+    _data_layout           = input->info()->data_layout();
+    const bool is_nhwc     = _data_layout == DataLayout::NHWC;
+    const int  idx_width   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const int  idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+
+    Status status = validate_and_initialize_values(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), conv_info, weights_info, act_info, dt, _append_bias, _skip_im2col,
+                                                   _are_weights_reshaped,
                                                    kernel_width, kernel_height,
-                                                   _is_fully_connected_convolution, _is_interleaved, _is_quantized,
-                                                   mat_weights_cols, mat_weights_rows, conv_w, conv_h);
+                                                   _is_fully_connected_convolution, _is_interleaved, _is_quantized, _is_activationlayer_enabled,
+                                                   mat_weights_cols, mat_weights_rows, conv_w, conv_h, dilation);
 
     ARM_COMPUTE_ERROR_THROW_ON(status);
 
+    _original_weights                       = weights;
     const unsigned int fixed_point_position = input->info()->fixed_point_position();
     const ITensor     *biases_to_use        = (_append_bias) ? biases : nullptr;
 
-#if defined(__arm__)
-    if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
-    {
-        _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
-    }
-#elif defined(__aarch64__)
-    if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
-    {
-        _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
-    }
-#endif /* defined(__arm__) || defined(__aarch64__) */
+    bool run_optimised = dt == DataType::F32;
 
     // Reshape weights if needed
-    if(_mm_optimised_kernel != nullptr)
+    if(run_optimised)
     {
-        if(_are_weights_reshaped)
-        {
-            mat_weights_cols = weights_info.num_kernels();
-            mat_weights_rows = weights->info()->dimension(1);
-        }
-        else
-        {
-            TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
+        TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
 
-            // Create tensor to store the reshaped weights
-            _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
-            _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
-            weights = &_weights_reshaped;
-        }
+        // Create tensor to store the reshaped weights
+        _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+        _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+        weights = &_weights_reshaped;
     }
     else
     {
@@ -335,12 +310,12 @@
             if(_is_fully_connected_convolution || _is_quantized)
             {
                 mat_weights_cols = weights_info.num_kernels();
-                mat_weights_rows = weights->info()->dimension(1);
+                mat_weights_rows = weights->info()->dimension(idx_height);
             }
             else
             {
                 mat_weights_cols = weights_info.num_kernels();
-                mat_weights_rows = weights_info.kernel_size().first * weights_info.kernel_size().second * input->info()->dimension(2) + (_append_bias ? 1 : 0);
+                mat_weights_rows = weights_info.kernel_size().first * weights_info.kernel_size().second * input->info()->dimension(idx_channel) + (_append_bias ? 1 : 0);
             }
         }
         else
@@ -366,66 +341,56 @@
         }
     }
 
-    // Create tensor to store im2col reshaped inputs
-    const unsigned int mat_input_cols = mat_weights_rows;
-    const unsigned int mat_input_rows = conv_w * conv_h;
-
-    TensorShape shape_im2col(input->info()->tensor_shape());
-    shape_im2col.set(0, mat_input_cols);
-    shape_im2col.set(1, mat_input_rows);
-    shape_im2col.set(2, 1);
-    _input_im2col_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-    _memory_group.manage(&_input_im2col_reshaped);
-
-    // Create tensor (interleave) to prepare input tensor for GEMM
-    if(!_is_fully_connected_convolution && _mm_optimised_kernel == nullptr)
+    // In case we skip im2col we have to add bias
+    if(!_skip_im2col)
     {
-        TensorShape shape_interleaved(shape_im2col);
-        shape_interleaved.set(0, shape_interleaved.x() * 4);
-        shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
-        _input_interleaved_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_interleaved));
-        _memory_group.manage(&_input_interleaved_reshaped);
+        const unsigned int mat_input_cols = mat_weights_rows;
+        const unsigned int mat_input_rows = conv_w * conv_h;
+
+        // Create tensor to store im2col reshaped inputs
+        TensorShape shape_im2col(input->info()->tensor_shape());
+        shape_im2col.set(0, mat_input_cols);
+        shape_im2col.set(1, mat_input_rows);
+        shape_im2col.set(2, 1);
+        _input_im2col_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+        _memory_group.manage(&_input_im2col_reshaped);
+
+        // Create tensor (interleave) to prepare input tensor for GEMM
+        if(!_is_fully_connected_convolution && !run_optimised && _is_interleaved)
+        {
+            TensorShape shape_interleaved(shape_im2col);
+            shape_interleaved.set(idx_width, shape_interleaved.x() * 4);
+            shape_interleaved.set(idx_height, std::ceil(shape_interleaved[idx_height] / 4.f));
+            _input_interleaved_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_interleaved));
+            _memory_group.manage(&_input_interleaved_reshaped);
+        }
+
+        // Create GEMM output tensor
+        TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
+        shape_gemm.set(0, mat_weights_cols);
+        shape_gemm.set(1, mat_input_rows);
+        const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
+        // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
+        TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
+        info_gemm.set_quantization_info(output->info()->quantization_info());
+        _gemm_output.allocator()->init(info_gemm);
+
+        // Configure im2col
+        _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias, false, false, dilation);
+    }
+    else if(_append_bias)
+    {
+        // Configure add bias kernel
+        _add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
     }
 
-    // Create GEMM output tensor
-    TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
-    shape_gemm.set(0, mat_weights_cols);
-    shape_gemm.set(1, mat_input_rows);
-    const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
-    // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
-    TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
-    info_gemm.set_quantization_info(output->info()->quantization_info());
-    _gemm_output.allocator()->init(info_gemm);
-    _memory_group.manage(&_gemm_output);
-
-    // Configure kernels
-    // Configure im2col
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias);
-
     // Configure matrix multiply
-    if(_mm_optimised_kernel != nullptr)
+    if(run_optimised)
     {
-        struct CPUInfo ci = NEScheduler::get().cpu_info();
-
-        const int M = _gemm_output.info()->tensor_shape().y();
-        const int N = _gemm_output.info()->tensor_shape().x();
-        const int K = _input_im2col_reshaped.info()->tensor_shape().x();
-
-#if defined(__aarch64__)
-        if((N <= 128) && (K <= 128))
+        if(!setup_assembly_kernel(_skip_im2col ? input : &_input_im2col_reshaped, weights, is_nhwc ? output : &_gemm_output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue))
         {
-            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64NativeKernel>();
+            ARM_COMPUTE_ERROR("setup_assembly_kernel failed.");
         }
-        else
-#endif /* defined(__aarch64__) */
-        {
-            configure_asm_mm(ci, M, N, K);
-        }
-
-        // Configure matrix multiplication kernel
-        _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
-
-        _workspace.allocator()->allocate();
     }
     else
     {
@@ -435,8 +400,8 @@
             _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
 
             // Configure GEMM
-            configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output, _is_interleaved, GEMMReshapeInfo(_input_im2col_reshaped.info()->dimension(1), 0 /* no transpose */,
-                                                                                                                _input_im2col_reshaped.info()->dimension(0)));
+            configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output, _is_interleaved, GEMMReshapeInfo(_input_im2col_reshaped.info()->dimension(idx_height), 0 /* no transpose */,
+                                                                                                                _input_im2col_reshaped.info()->dimension(idx_width)));
             _input_interleaved_reshaped.allocator()->allocate();
         }
         else
@@ -445,48 +410,63 @@
         }
     }
 
-    _input_im2col_reshaped.allocator()->allocate();
-
-    // Configure output stage for quantized case
-    if(_is_quantized)
+    if(!_skip_im2col)
     {
-        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+        _input_im2col_reshaped.allocator()->allocate();
 
-        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-        int   output_multiplier, output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _memory_group.manage(&_tmp_output);
-        _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output_quant_info.offset);
+        // Configure output stage for quantized case
+        if(_is_quantized)
+        {
+            const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+            float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+            int   output_multiplier, output_shift;
+            quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+            _memory_group.manage(&_tmp_output);
+            _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output_quant_info.offset);
+        }
+
+        // Configure Col2Im
+        if(!is_nhwc)
+        {
+            _output_col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, Size2D(conv_w, conv_h));
+        }
+
+        if(_is_quantized)
+        {
+            _tmp_output.allocator()->allocate();
+        }
+        _gemm_output.allocator()->allocate();
     }
 
-    // Configure Col2Im
-    _output_col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, Size2D(conv_w, conv_h));
-    if(_is_quantized)
-    {
-        _tmp_output.allocator()->allocate();
-    }
-    _gemm_output.allocator()->allocate();
-
-    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), "Output shape does not match the expected one");
 
     // Allocate intermediate tensor
     if(!_are_weights_reshaped)
     {
         _weights_reshaped.allocator()->allocate();
     }
+
+    //Configure Activation Layer
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
 }
 
 Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                        const WeightsInfo &weights_info)
+                                        const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(output);
 
     DataType     dt{};
     bool         append_bias{};
+    bool         skip_im2col{};
     bool         are_weights_reshaped{};
     bool         is_fully_connected_convolution{};
     bool         is_interleaved{};
     bool         is_quantized{};
+    bool         is_activationlayer_enabled{};
     unsigned int kernel_width     = 0;
     unsigned int kernel_height    = 0;
     unsigned int mat_weights_cols = 0;
@@ -494,9 +474,14 @@
     unsigned int conv_w           = 0;
     unsigned int conv_h           = 0;
 
-    Status status = validate_and_initialize_values(input, weights, biases, conv_info, weights_info, dt, append_bias, are_weights_reshaped, kernel_width, kernel_height,
-                                                   is_fully_connected_convolution, is_interleaved, is_quantized, mat_weights_cols, mat_weights_rows,
-                                                   conv_w, conv_h);
+    const DataLayout data_layout = input->data_layout();
+    const bool       is_nhwc     = data_layout == DataLayout::NHWC;
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    Status status = validate_and_initialize_values(input, weights, biases, conv_info, weights_info, act_info, dt, append_bias, skip_im2col, are_weights_reshaped, kernel_width, kernel_height,
+                                                   is_fully_connected_convolution, is_interleaved, is_quantized, is_activationlayer_enabled, mat_weights_cols, mat_weights_rows,
+                                                   conv_w, conv_h, dilation);
 
     const Size2D kernel_weights = Size2D(kernel_width, kernel_height);
 
@@ -505,68 +490,11 @@
     std::unique_ptr<ITensorInfo> reshaped_weights = weights->clone();
     bool                         optimised_kernel = false;
 
-#if defined(__arm__)
-    if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+    if(dt == DataType::F32)
     {
         optimised_kernel = true;
     }
-#elif defined(__aarch64__)
-    if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
-    {
-        optimised_kernel = true;
-    }
-#endif /* defined(__arm__) || defined(__aarch64__) */
 
-    // Reshape weights if needed
-    if(optimised_kernel)
-    {
-        if(are_weights_reshaped)
-        {
-            mat_weights_cols = weights_info.num_kernels();
-            mat_weights_rows = weights->dimension(1);
-        }
-        else
-        {
-            TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
-
-            // Create tensor to store the reshaped weights
-            reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
-            weights = reshaped_weights.get();
-        }
-    }
-    else
-    {
-        if(are_weights_reshaped)
-        {
-            const unsigned int transpose_width = 16 / input->element_size();
-            mat_weights_cols                   = weights_info.num_kernels();
-            mat_weights_rows                   = weights->dimension(0) / transpose_width + (append_bias ? 1 : 0);
-        }
-        else
-        {
-            TensorShape reshaped_weights_shape;
-
-            if(is_fully_connected_convolution || is_quantized)
-            {
-                reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
-            }
-            else
-            {
-                // Create tensor to store transposed weights
-                const float transpose_width = 16.0f / input->element_size();
-                reshaped_weights_shape      = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
-                                                           static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
-            }
-
-            // Create tensor to store the reshaped weights
-            reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
-            weights = reshaped_weights.get();
-        }
-    }
-
-    // Validate im2col
     const unsigned int mat_input_cols = mat_weights_rows;
     const unsigned int mat_input_rows = conv_w * conv_h;
     TensorShape        shape_im2col   = input->tensor_shape();
@@ -574,7 +502,17 @@
     shape_im2col.set(1, mat_input_rows);
     shape_im2col.set(2, 1);
     TensorInfo im2_col_info = input->clone()->set_tensor_shape(shape_im2col);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2_col_info, kernel_weights, conv_info, append_bias, false));
+
+    if(!skip_im2col)
+    {
+        // Validate im2col
+        ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2_col_info, kernel_weights, conv_info, append_bias, false, false, dilation));
+    }
+    else if(append_bias)
+    {
+        // Validate add bias kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
+    }
 
     // Create GEMM output tensor
     TensorShape shape_gemm(im2_col_info.tensor_shape());
@@ -582,19 +520,63 @@
     shape_gemm.set(1, mat_input_rows);
     TensorInfo gemm_output_info = input->clone()->set_tensor_shape(shape_gemm);
 
-    // Validate GEMM interleave and multiply
-    if(is_interleaved)
+    // Reshape weights if needed
+    if(optimised_kernel)
     {
-        TensorShape shape_interleaved = shape_im2col;
-        shape_interleaved.set(0, shape_interleaved.x() * 4);
-        shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
-        TensorInfo input_interleaved_info = input->clone()->set_tensor_shape(shape_interleaved);
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(&im2_col_info, &input_interleaved_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&input_interleaved_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
+        ARM_COMPUTE_RETURN_ERROR_ON(are_weights_reshaped);
+
+        // Create tensor to store the reshaped weights
+        reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
     }
-    else
+    else if(!is_quantized)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&im2_col_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
+        TensorShape reshaped_weights_shape;
+
+        if(is_fully_connected_convolution || is_quantized)
+        {
+            reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
+        }
+        else
+        {
+            // Create tensor to store transposed weights
+            const float transpose_width = 16.0f / input->element_size();
+            reshaped_weights_shape      = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
+                                                       static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
+        }
+
+        // Create tensor to store the reshaped weights
+        reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
+        weights = reshaped_weights.get();
+
+        // Validate GEMM interleave and multiply
+        if(is_interleaved)
+        {
+            TensorShape shape_interleaved = shape_im2col;
+            shape_interleaved.set(idx_width, shape_interleaved.x() * 4);
+            shape_interleaved.set(idx_height, std::ceil(shape_interleaved.y() / 4.f));
+            TensorInfo input_interleaved_info = input->clone()->set_tensor_shape(shape_interleaved);
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(&im2_col_info, &input_interleaved_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&input_interleaved_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo(shape_im2col[1],            // m
+                                                                             weights->tensor_shape()[0], // n
+                                                                             shape_im2col[0]) /* k */));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&im2_col_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
+        }
+    }
+    if(!is_nhwc)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(idx_width) != conv_w) || (output->dimension(idx_height) != conv_h), "Output shape does not match the expected one");
+
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
     }
 
     return Status{};
@@ -605,19 +587,33 @@
     // Run weights reshaping (Runs once for every configure)
     if(!_are_weights_reshaped)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _are_weights_reshaped = true;
         _reshape_weights.run();
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
 
-    // Run input reshaping
-    NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+    if(!_skip_im2col)
+    {
+        // Run input reshaping
+        unsigned int _y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+        NEScheduler::get().schedule(&_input_im2col_kernel, _y_dim);
+    }
 
     // Runs matrix multiply on reshaped matrices
-    if(_mm_optimised_kernel != nullptr)
+    if(_asm_glue._optimised_kernel != nullptr)
     {
-        NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+        _asm_glue.run();
+        // Release weights in case buffer is pretransposed
+        if(!_weights_reshaped.is_used())
+        {
+            _weights_reshaped.allocator()->free();
+        }
     }
     else
     {
@@ -638,6 +634,11 @@
         }
     }
 
+    if(_skip_im2col && _append_bias)
+    {
+        NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY);
+    }
+
     // Run output stage for quantized case
     if(_is_quantized)
     {
@@ -645,7 +646,15 @@
     }
 
     // Reshape output matrix
-    NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+    if(_data_layout == DataLayout::NCHW)
+    {
+        NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+    }
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
 
     _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 9b36e81..98b4767 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2017 ARM Limited.
+/* Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,13 +25,9 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
@@ -39,20 +35,11 @@
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "support/ToolchainSupport.h"
 
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp"
-} // namespace arm_compute
-
 using namespace arm_compute;
 
 NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(), _workspace()
+    : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(),
+      _workspace(), _B_pretransposed()
 {
 }
 
@@ -65,89 +52,29 @@
     ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
     ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
 
+    bool run_optimised = false;
 #ifdef __aarch64__
-    const int            M                   = output->info()->tensor_shape().y();
-    const int            N                   = output->info()->tensor_shape().x();
-    const int            K                   = a->info()->tensor_shape().x();
-    constexpr size_t     workspace_alignment = 4096;
-    const struct CPUInfo ci                  = NEScheduler::get().cpu_info();
+    switch(a->info()->data_type())
+    {
+        case DataType::S8:
+        {
+            run_optimised = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue_signed);
+            break;
+        }
+        case DataType::QASYMM8:
+        case DataType::U8:
+        {
+            run_optimised = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue_unsigned);
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Datatype not supported");
+            break;
+        }
+    }
 #endif /* __aarch64__ */
-
-#ifdef ARM_COMPUTE_AARCH64_V8_2
-    if(ci.CPU == CPUTarget::A75_DOT || ci.CPU == CPUTarget::A55_DOT)
-    {
-        // Configure matrix multiply kernel
-        GemmInterleaved<gemm_s8_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
-        _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-        _memory_group.manage(&_workspace);
-
-        // Configure matrix multiplication kernel
-        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
-        k->configure(a, b, output, &_workspace, 1.f, 1.f);
-        _mm_kernel = std::move(k);
-        _workspace.allocator()->allocate();
-    }
-    else
-#elif defined(ARM_COMPUTE_AARCH64_V8A)
-    if(ci.CPU == CPUTarget::A53)
-    {
-        switch(a->info()->data_type())
-        {
-            case DataType::S8:
-            {
-                // Configure matrix multiply kernel
-                GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
-                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-            }
-            break;
-            case DataType::U8:
-            {
-                // Configure matrix multiply kernel
-                GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
-                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-            }
-            break;
-            default:
-                ARM_COMPUTE_ERROR("Datatype not supported");
-        }
-
-        _memory_group.manage(&_workspace);
-        // Configure matrix multiplication kernel
-        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64A53Kernel>();
-        k->configure(a, b, output, &_workspace, 1.f, 1.f);
-        _mm_kernel = std::move(k);
-        _workspace.allocator()->allocate();
-    }
-    else if(1) // Generic v8a kernel
-    {
-        switch(a->info()->data_type())
-        {
-            case DataType::S8:
-            {
-                // Configure matrix multiply kernel
-                GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
-                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-            }
-            break;
-            case DataType::U8:
-            {
-                // Configure matrix multiply kernel
-                GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
-                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-            }
-            break;
-            default:
-                ARM_COMPUTE_ERROR("Datatype not supported");
-        }
-        _memory_group.manage(&_workspace);
-        // Configure matrix multiplication kernel
-        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64Kernel>();
-        k->configure(a, b, output, &_workspace, 1.f, 1.f);
-        _mm_kernel = std::move(k);
-        _workspace.allocator()->allocate();
-    }
-    else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+    if(!run_optimised)
     {
         // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
         TensorShape shape_tmp_a = a->info()->tensor_shape();
@@ -206,7 +133,18 @@
         NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
     }
 
-    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+    if(_asm_glue_unsigned._optimised_kernel != nullptr)
+    {
+        _asm_glue_unsigned.run();
+    }
+    else if(_asm_glue_signed._optimised_kernel != nullptr)
+    {
+        _asm_glue_signed.run();
+    }
+    else
+    {
+        NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+    }
 
     _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index ad47593..2e06fa2 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -26,11 +26,9 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
@@ -39,58 +37,48 @@
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "support/ToolchainSupport.h"
 
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
-} // namespace arm_compute
-
 using namespace arm_compute;
 using namespace arm_compute::misc::shape_calculator;
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
-      _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _dot_product_path(false)
+    : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
+      _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _a_offset(0), _b_offset(0),
+      _run_vector_matrix_multiplication(false), _dot_product_path(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
 {
 }
 
 void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-    ARM_COMPUTE_UNUSED(gemm_info);
     ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
 
     _a_offset                         = a->info()->quantization_info().offset;
     _b_offset                         = b->info()->quantization_info().offset;
     _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+    _reshape_b_only_on_first_run      = gemm_info.reshape_b_only_on_first_run();
 
-#ifdef ARM_COMPUTE_AARCH64_V8_2
-    // Check for DOT product instruction
-    const struct CPUInfo ci              = NEScheduler::get().cpu_info();
-    const int            cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
-
-    if(cpu_has_dotprod != 0)
+#ifdef __aarch64__
+    switch(a->info()->data_type())
     {
-        _dot_product_path = true;
-
-        // Configure matrix multiply kernel
-        struct CPUInfo ci = NEScheduler::get().cpu_info();
-        const int      M  = output->info()->tensor_shape().y();
-        const int      N  = output->info()->tensor_shape().x();
-        const int      K  = a->info()->tensor_shape().x();
-
-        const size_t     workbench_size = GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
-        constexpr size_t alignment      = 4096;
-        _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-        _memory_group.manage(&_workspace);
-
-        // Configure matrix multiplication kernel
-        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
-        k->configure(a, b, output, &_workspace, 1.f, 1.f, false, false);
-        _mm_kernel = std::move(k);
+        case DataType::S8:
+        {
+            _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed);
+            break;
+        }
+        case DataType::QASYMM8:
+        case DataType::U8:
+        {
+            _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned);
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Datatype not supported");
+            break;
+        }
     }
-    else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+#endif /* __aarch64__ */
+    if(!_dot_product_path)
     {
         if(_run_vector_matrix_multiplication)
         {
@@ -110,7 +98,10 @@
             _tmp_a.allocator()->init(info_a);
             _tmp_b.allocator()->init(info_b);
             _memory_group.manage(&_tmp_a);
-            _memory_group.manage(&_tmp_b);
+            if(!_reshape_b_only_on_first_run)
+            {
+                _memory_group.manage(&_tmp_b);
+            }
 
             // Configure interleave kernel
             {
@@ -141,7 +132,10 @@
         TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
 
         _vector_sum_col.allocator()->init(info_vector_sum_col);
-        _memory_group.manage(&_vector_sum_col);
+        if(!_reshape_b_only_on_first_run)
+        {
+            _memory_group.manage(&_vector_sum_col);
+        }
 
         // Configure Matrix B reduction kernel
         _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);
@@ -168,10 +162,6 @@
         _tmp_a.allocator()->allocate();
         _tmp_b.allocator()->allocate();
     }
-    else
-    {
-        _workspace.allocator()->allocate();
-    }
 
     if(_a_offset != 0)
     {
@@ -203,42 +193,28 @@
     int32_t b_offset                         = b->quantization_info().offset;
     bool    run_vector_matrix_multiplication = a->dimension(1) < 2;
 
-#ifdef ARM_COMPUTE_AARCH64_V8_2
-    // Check for DOT product instruction
-    const struct CPUInfo ci              = NEScheduler::get().cpu_info();
-    const int            cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
-
-    if(cpu_has_dotprod != 0)
+    if(!run_vector_matrix_multiplication)
     {
-        // Validate matrix multiply kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpAArch64V8P4Kernel::validate(a, b, output));
+        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+        TensorShape shape_tmp_a = a->tensor_shape();
+        shape_tmp_a.set(0, a->dimension(0) * 4);
+        shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+        TensorShape shape_tmp_b = b->tensor_shape();
+        shape_tmp_b.set(0, b->dimension(1) * 16);
+        shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+        TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+        TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
     }
     else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
     {
-        if(!run_vector_matrix_multiplication)
-        {
-            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-            TensorShape shape_tmp_a = a->tensor_shape();
-            shape_tmp_a.set(0, a->dimension(0) * 4);
-            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
-            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-            TensorShape shape_tmp_b = b->tensor_shape();
-            shape_tmp_b.set(0, b->dimension(1) * 16);
-            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
-            TensorInfo info_a(shape_tmp_a, 1, a->data_type());
-            TensorInfo info_b(shape_tmp_b, 1, b->data_type());
-
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
-        }
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
     }
 
     TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -282,13 +258,24 @@
             NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
         }
 
-        if(_mtx_b_reshape_kernel)
+        if(_mtx_b_reshape_kernel && (_is_first_run || !_reshape_b_only_on_first_run))
         {
             NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
         }
     }
 
-    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+    if(_asm_glue_unsigned._optimised_kernel != nullptr)
+    {
+        _asm_glue_unsigned.run();
+    }
+    else if(_asm_glue_signed._optimised_kernel != nullptr)
+    {
+        _asm_glue_signed.run();
+    }
+    else
+    {
+        NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+    }
 
     // Run matrix A reduction kernel only if _b_offset is not equal to 0
     if(_b_offset != 0)
@@ -297,7 +284,7 @@
     }
 
     // Run matrix B reduction kernel only if _a_offset is not equal to 0
-    if(_a_offset != 0)
+    if(_a_offset != 0 && (_is_first_run || !_reshape_b_only_on_first_run))
     {
         NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
     }
@@ -306,4 +293,6 @@
     NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
 
     _memory_group.release();
+
+    _is_first_run = false;
 }
diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index b962db9..6b95cb0 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -23,19 +23,30 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
 
-#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected)
+NEIm2Col::NEIm2Col()
+    : _kernel(), _y_dim(1)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
-    k->configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected);
-    _kernel = std::move(k);
 }
 
-Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
 {
-    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected);
+    _y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+    _kernel.configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+}
+
+Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
+{
+    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+}
+
+void NEIm2Col::run()
+{
+    NEScheduler::get().schedule(&_kernel, _y_dim);
 }
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index fa62483..d0b80fb 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,6 +46,26 @@
     _sumsq.allocator()->allocate();
 }
 
+Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+    TensorShape shape(input->tensor_shape());
+
+    // Create intermediate tensor info
+    TensorInfo sum_sq;
+    sum_sq.set_data_type(input->data_type());
+    sum_sq.set_tensor_shape(shape);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
+
+    // Reduce shape on axis (supported axis is 0)
+    shape.set(0, 1);
+    sum_sq.set_tensor_shape(shape);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
+
+    return Status{};
+}
+
 void NEL2NormalizeLayer::run()
 {
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 45ddb70..913acf8 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -33,39 +33,102 @@
 
 using namespace arm_compute;
 
+namespace
+{
+void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                      TensorShape &shape_wr, TensorShape &shape_im2col, TensorShape &shape_gemm)
+{
+    ARM_COMPUTE_UNUSED(output);
+
+    const unsigned int kernel_width  = weights->dimension(0);
+    const unsigned int kernel_height = weights->dimension(1);
+
+    bool has_bias = (biases != nullptr);
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+                                                 conv_info);
+
+    const size_t mat_weights_cols = weights->dimension(3);
+    const size_t mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + ((has_bias) ? 1 : 0);
+    const size_t mat_weights_num  = weights->dimension(4);
+
+    shape_wr = TensorShape(mat_weights_cols, mat_weights_rows, mat_weights_num);
+
+    const size_t mat_input_cols = mat_weights_rows;
+    const size_t mat_input_rows = conv_w * conv_h;
+
+    shape_im2col = input->tensor_shape();
+    shape_im2col.set(0, mat_input_cols);
+    shape_im2col.set(1, mat_input_rows);
+    shape_im2col.set(2, 1);
+
+    shape_gemm = shape_im2col;
+    shape_gemm.set(0, mat_weights_cols);
+    shape_gemm.set(1, mat_input_rows);
+}
+} // namespace
+
 NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
-      _is_first_run(false)
+      _is_first_run(false), _original_weights(nullptr)
 {
 }
 
+Status NELocallyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON(!conv_info.padding_is_symmetric());
+
+    bool has_bias = (biases != nullptr);
+
+    if(has_bias)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 2);
+    }
+
+    const unsigned int kernel_width  = weights->dimension(0);
+    const unsigned int kernel_height = weights->dimension(1);
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+                                                 conv_info);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+    // Calculate intermediate buffer shapes
+    TensorShape shape_wr;
+    TensorShape shape_im2col;
+    TensorShape shape_gemm;
+    calculate_shapes(input, weights, biases, output, conv_info, shape_wr, shape_im2col, shape_gemm);
+
+    TensorInfo weights_reshaped_info(shape_wr, 1, weights->data_type());
+    TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type());
+    TensorInfo gemm_output_info(shape_gemm, 1, input->data_type());
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias, false));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NELocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
+
+    return Status{};
+}
+
 void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(!conv_info.padding_is_symmetric());
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NELocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
 
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
-    }
-
-    bool _has_bias = (biases != nullptr);
-    _is_first_run  = true;
-
-    // Get parameters for conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    unsigned int pad_x    = 0;
-    unsigned int pad_y    = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-    std::tie(pad_x, pad_y)       = conv_info.pad();
+    bool _has_bias    = (biases != nullptr);
+    _is_first_run     = true;
+    _original_weights = weights;
 
     const unsigned int kernel_width  = weights->info()->dimension(0);
     const unsigned int kernel_height = weights->info()->dimension(1);
@@ -76,32 +139,14 @@
     std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
                                                  conv_info);
 
-    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
-    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
-
-    // Create tensor to store the reshaped weights
-    const size_t mat_weights_cols = weights->info()->dimension(3);
-    const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
-    const size_t mat_weights_num  = weights->info()->dimension(4);
-
-    const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
+    // Calculate intermediate buffer shapes
+    TensorShape shape_wr;
+    TensorShape shape_im2col;
+    TensorShape shape_gemm;
+    calculate_shapes(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info, shape_wr, shape_im2col, shape_gemm);
 
     _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
-
-    // Create tensor to store im2col reshaped inputs
-    const size_t mat_input_cols = mat_weights_rows;
-    const size_t mat_input_rows = conv_w * conv_h;
-    TensorShape  shape_im2col   = input->info()->tensor_shape();
-    shape_im2col.set(0, mat_input_cols);
-    shape_im2col.set(1, mat_input_rows);
-    shape_im2col.set(2, 1);
-
     _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
-
-    // Create locally connected layer output tensor
-    TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
-    shape_gemm.set(0, mat_weights_cols);
-    shape_gemm.set(1, mat_input_rows);
     _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
 
     // Manage intermediate buffers
@@ -125,8 +170,13 @@
     // Run weights reshaping (Runs once for every configure)
     if(_is_first_run)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _is_first_run = false;
         NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 5a474e4..cf6b984 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 
+#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 #include "support/ToolchainSupport.h"
 
@@ -30,11 +31,21 @@
 
 using namespace arm_compute;
 
-void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+void NEPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
     k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
     _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
 }
 Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index bc0b6f8..cbfd684 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -31,7 +31,7 @@
 using namespace arm_compute;
 
 NEPoolingLayer::NEPoolingLayer()
-    : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false)
+    : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false), _data_layout(DataLayout::NCHW)
 {
 }
 
@@ -40,17 +40,31 @@
     // Check if we have Global Pooling Layer
     _is_global_pooling_layer = (input->info()->dimension(0) == pool_info.pool_size().width) && (input->info()->dimension(1) == pool_info.pool_size().height);
 
+    // Get data layout
+    _data_layout = input->info()->data_layout();
+
     // Configure pooling kernel
     _pooling_layer_kernel.configure(input, output, pool_info);
 
-    // Configure border depending on operation required (quantize border in case of asymmetric data_type)
-    BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-    PixelValue zero_value(0.f);
-    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
+    switch(_data_layout)
     {
-        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+        case DataLayout::NCHW:
+        {
+            // Configure border depending on operation required (quantize border in case of asymmetric data_type)
+            BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+            PixelValue zero_value(0.f);
+            if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
+            {
+                zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+            }
+            _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value);
+            break;
+        }
+        case DataLayout::NHWC:
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data layout not supported");
     }
-    _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value);
 }
 
 Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
@@ -60,9 +74,20 @@
 
 void NEPoolingLayer::run()
 {
-    // Fill border
-    NEScheduler::get().schedule(&_border_handler, Window::DimY);
+    switch(_data_layout)
+    {
+        case DataLayout::NCHW:
+            // Fill border
+            NEScheduler::get().schedule(&_border_handler, Window::DimY);
 
-    // Run pooling layer
-    NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY);
+            // Run pooling layer
+            NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY);
+            break;
+        case DataLayout::NHWC:
+            // Run pooling layer
+            NEScheduler::get().schedule(&_pooling_layer_kernel, Window::DimX);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data layout not supported");
+    }
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index a131c48..8f7db96 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 using namespace arm_compute;
@@ -34,8 +35,21 @@
 {
 }
 
+Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    TensorInfo min_max{ input->num_channels(), input->data_type() };
+    ARM_COMPUTE_RETURN_ON_ERROR(NEMinMaxLayerKernel::validate(input, &min_max));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output, &min_max));
+
+    return Status{};
+}
+
 void NEQuantizationLayer::configure(const ITensor *input, ITensor *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
     // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel
     _min_max_kernel.configure(input, &_min_max);
 
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index f1a9145..cd0b42f 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,6 +63,13 @@
 {
 }
 
+Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op));
+
+    return Status{};
+}
+
 void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index bd565c9..a9c85bd 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,6 @@
 void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == offsets);
-    ARM_COMPUTE_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
     ARM_COMPUTE_UNUSED(sampling_policy);
 
     Window win;
@@ -66,7 +65,7 @@
             const int   in_xi = std::floor(in_x);
             const int   in_yi = std::floor(in_y);
 
-            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
+            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * static_cast<int>(input_element_size);
             *reinterpret_cast<float *>(dx_it.ptr())        = in_x - in_xi;
             *reinterpret_cast<float *>(dy_it.ptr())        = in_y - in_yi;
         },
@@ -99,20 +98,20 @@
 
 void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
 {
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), policy, border_mode, constant_border_value, sampling_policy));
 
-    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
-    {
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
-    }
+    // Get data layout and width/height indices
+    const DataLayout data_layout = input->info()->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     // Get the tensor shape
-    const TensorShape shape(output->info()->dimension(0), output->info()->dimension(1));
+    const TensorShape shape(output->info()->dimension(idx_width), output->info()->dimension(idx_height));
 
     // Compute the ratio between source width/height and destination width/height
-    const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
-    const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+    const auto wr = static_cast<float>(input->info()->dimension(idx_width)) / static_cast<float>(output->info()->dimension(idx_width));
+    const auto hr = static_cast<float>(input->info()->dimension(idx_height)) / static_cast<float>(output->info()->dimension(idx_height));
 
     // Get the element size of the input image
     const size_t input_element_size = input->info()->element_size();
@@ -123,9 +122,6 @@
         policy = InterpolationPolicy::NEAREST_NEIGHBOR;
     }
 
-    // Check if the border mode is UNDEFINED
-    const bool border_undefined = border_mode == BorderMode::UNDEFINED;
-
     switch(policy)
     {
         case InterpolationPolicy::NEAREST_NEIGHBOR:
@@ -133,7 +129,7 @@
             TensorInfo tensor_info_offsets(shape, Format::S32);
             _offsets.allocator()->init(tensor_info_offsets);
 
-            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined, sampling_policy);
+            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_mode, sampling_policy);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -151,7 +147,7 @@
             _dx.allocator()->init(tensor_info_dxdy);
             _dy.allocator()->init(tensor_info_dxdy);
 
-            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined, sampling_policy);
+            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_mode, sampling_policy);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -164,7 +160,7 @@
         }
         case InterpolationPolicy::AREA:
         {
-            _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined);
+            _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_mode);
             break;
         }
         default:
@@ -174,6 +170,48 @@
     _border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value));
 }
 
+Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
+                         BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+    ARM_COMPUTE_UNUSED(border_mode, constant_border_value);
+
+    ITensorInfo *offsets = nullptr;
+    ITensorInfo *dx      = nullptr;
+    ITensorInfo *dy      = nullptr;
+
+    // Get data layout and width/height indices
+    const DataLayout data_layout = input->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    // Get the tensor shape of auxilary buffers
+    const TensorShape shape(output->dimension(idx_width), output->dimension(idx_height));
+
+    TensorInfo tensor_info_offsets(shape, Format::S32);
+    TensorInfo tensor_info_dx(shape, Format::F32);
+    TensorInfo tensor_info_dy(shape, Format::F32);
+
+    switch(policy)
+    {
+        case InterpolationPolicy::NEAREST_NEIGHBOR:
+            offsets = &tensor_info_offsets;
+            break;
+        case InterpolationPolicy::BILINEAR:
+            offsets = &tensor_info_offsets;
+            dx      = &tensor_info_dx;
+            dy      = &tensor_info_dy;
+            break;
+        default:
+            break;
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(),
+                                                        policy, border_mode, sampling_policy));
+    return Status{};
+}
+
 void NEScale::run()
 {
     NEScheduler::get().schedule(&_border_handler, Window::DimZ);
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
new file mode 100644
index 0000000..8f2c4c4
--- /dev/null
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+
+namespace arm_compute
+{
+namespace
+{
+inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
+{
+    const DataLayout data_layout = input->info()->data_layout();
+    const int        in_width    = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
+    const int        in_height   = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
+    const int        in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+    const int        in_batches  = input->info()->dimension(3);
+
+    return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+    const DataLayout   data_layout = input->data_layout();
+    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    ARM_COMPUTE_UNUSED(output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); // COMPMID-1162
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 3 && weights->dimension(height_idx) != 5, "Only 3 and 5 kernels are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    return Status{};
+}
+
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims)
+{
+    Size2D output_tile = Size2D{};
+
+    if(kernel_dims == Size2D(3U, 3U))
+    {
+        output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+    }
+    else if(kernel_dims == Size2D(5U, 5U))
+    {
+        output_tile = Size2D(2U, 2U);
+    }
+
+    return output_tile;
+}
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
+{
+    // Check if we want to configure a Winograd configuration which requires fast math
+    using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+    std::vector<WinogradConfiguration> fast_math_winograd =
+    {
+        WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
+        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+    };
+
+    auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+                            std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+    return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
+}
+} //namespace
+
+NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr),
+      _activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(),
+      _workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false)
+{
+} /* arm_compute */
+
+void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
+                                           bool enable_fast_math)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info));
+
+    // Get indices for the width and height
+    const DataLayout   data_layout = input->info()->data_layout();
+    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    const Size2D input_dims  = Size2D(input->info()->dimension(width_idx), input->info()->dimension(height_idx));
+    const Size2D kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx));
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+    // Check if the Winograd configuration requires fast math
+    if(!enable_fast_math)
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+    }
+
+    _weights = weights;
+    _input   = input;
+    _output  = output;
+
+    std::unique_ptr<INEWinogradLayerTransformInputKernel<float>>   transform_input_kernel;
+    std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
+    std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>>  transform_output_kernel;
+
+    int n_gemms = 0;
+    int N_BLOCK = 0; // Size of block used by GEMM.
+
+    switch(kernel_size.width)
+    {
+        case 3:
+        {
+            if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
+            {
+                transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>>();
+                transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>>();
+                transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>>();
+                n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradBase::N_GEMMS;
+                N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradConv::N_BLOCK;
+            }
+            else
+            {
+                transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
+                transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
+                transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
+                n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradBase::N_GEMMS;
+                N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradConv::N_BLOCK;
+            }
+            break;
+        }
+        case 5:
+        {
+            transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
+            transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
+            transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
+            n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradBase::N_GEMMS;
+            N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradConv::N_BLOCK;
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
+        }
+    }
+
+    const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
+    const bool        use_same_padding = use_padding_type == PADDING_SAME;
+
+    // Get convolved dimensions
+    const int in_channels  = input->info()->dimension(channel_idx);
+    const int out_channels = output->info()->dimension(channel_idx);
+
+    const Tensor4DShape in_shape(internal_get_input_shape(input));
+    const size_t        data_type_size = input->info()->element_size();
+    // Get the memory required to instantiate a new Winograd operator.
+    constexpr size_t storage_alignment   = 64;
+    const size_t     kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
+    _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+    _kernel_storage.allocator()->allocate();
+    // Input storage
+    const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
+    _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+    _input_workspace.allocator()->allocate();
+
+    // Output storage
+    const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size;
+    _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+    _output_workspace.allocator()->allocate();
+
+    // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
+    TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
+                                _output->info()->dimension(1), _output->info()->dimension(3)),
+                    1, _output->info()->data_type());
+    _output_nhwc.allocator()->init(info);
+    _output_nhwc.allocator()->allocate();
+
+    // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+    _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
+    _weights_hwio.allocator()->allocate();
+
+    // configure the kernel to transform the input tensor from NCHW -> NHWC
+    _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+    _input_nhwc.allocator()->allocate();
+
+    const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
+
+    // Configure the InputTransform
+    const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+    transform_input_kernel->configure(reinterpret_cast<float *>(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+                                      reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_stride);
+
+    // Configure WeightsTransform
+    const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
+    transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast<float *>(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels);
+
+    // Configure OutputTransform
+    //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
+    const int  output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+    const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
+
+    transform_output_kernel->configure(biases, reinterpret_cast<float *>(_output_workspace.buffer()),
+                                       output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
+                                       in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
+
+    // Configure GEMM
+    const int    tile_rows                = iceildiv(output_shape.n_rows, output_tile.height);
+    const int    tile_cols                = iceildiv(output_shape.n_cols, output_tile.width);
+    const int    m                        = in_shape.n_batches * tile_rows * tile_cols;
+    const int    k                        = in_shape.n_channels;
+    const int    n                        = out_channels;
+    const int    input_matrix_row_stride  = in_shape.n_channels;
+    const int    kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
+    const int    output_matrix_row_stride = kernel_matrix_row_stride;
+    unsigned int num_threads              = NEScheduler::get().num_threads();
+
+    _arm_gemm = arm_gemm::gemm<float, float>(NEScheduler::get().cpu_info(), m, n, k, 1, n_gemms, false, false, 1.f, 0.f, num_threads, false);
+    _arm_gemm->set_arrays(reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_row_stride, 0, input_matrix_stride, reinterpret_cast<float *>(_kernel_storage.buffer()),
+                          kernel_matrix_row_stride, kernel_matrix_stride, reinterpret_cast<float *>(_output_workspace.buffer()), output_matrix_row_stride, 0, output_matrix_stride);
+
+    auto acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapper<arm_gemm::GemmCommon<float, float>>>();
+    acl_gemm_wrapper->configure(_arm_gemm.get());
+    const size_t workspace_size = _arm_gemm->get_working_size();
+
+    // Allocate workspace
+    if(workspace_size > 0)
+    {
+        const unsigned int alignment = 4096;
+        allocate_workspace(workspace_size, _workspace, &_memory_group, alignment, 1);
+        _arm_gemm->set_working_space(reinterpret_cast<float *>(_workspace.buffer()));
+    }
+
+    const unsigned int window_size = _arm_gemm->get_window_size();
+    if(window_size < num_threads)
+    {
+        num_threads = window_size;
+        _arm_gemm->set_nthreads(num_threads);
+    }
+
+    _gemm_kernel = std::move(acl_gemm_wrapper);
+
+    // Reorder the convoluted output to ACL's ordering NCHW
+    _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
+
+    _transform_input_kernel   = std::move(transform_input_kernel);
+    _transform_weights_kernel = std::move(transform_weights_kernel);
+    _transform_output_kernel  = std::move(transform_output_kernel);
+
+    //Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
+}
+
+void NEWinogradConvolutionLayer::run()
+{
+    _memory_group.acquire();
+    if(!_reshaped_kernel)
+    {
+        _reshaped_kernel = true;
+        _permute_weights.run();
+        NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
+    }
+    //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
+    _permute_input.run();
+
+    // Transform input tensor to the winograd domain
+    NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
+
+    //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
+    NEScheduler::get().schedule(_gemm_kernel.get(), Window::DimX);
+
+    // Transform output tensor to the spatial domain
+    NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
+
+    // Reorder the convoluted output to ACL's ordering NCHW
+    _permute_output.run();
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
+    _memory_group.release();
+}
+
+Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                            const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
+
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(input->dimension(idx_width), input->dimension(idx_height));
+    const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height));
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+    // Check if the Winograd configuration requires fast math
+    if(!enable_fast_math)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+    }
+
+    const WinogradInfo winograd_info = WinogradInfo(output_tile,
+                                                    kernel_size,
+                                                    input_dims,
+                                                    conv_info,
+                                                    input->data_layout());
+
+    // Validate input transform
+    const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+    const TensorInfo  input0       = input->clone()->set_tensor_shape(input0_shape);
+    switch(weights->dimension(idx_width))
+    {
+        case 3:
+        {
+            if(input_dims.width > 4 && input_dims.height > 4)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, &input0, winograd_info)));
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, &input0, winograd_info)));
+            }
+            break;
+        }
+        case 5:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, &input0, winograd_info)));
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
+            break;
+        }
+    }
+    // Validate filter transform
+    const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+    const TensorInfo  input1       = weights->clone()->set_tensor_shape(input1_shape);
+
+    switch(weights->dimension(idx_width))
+    {
+        case 3:
+        {
+            if(input_dims.width > 4 && input_dims.height > 4)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, &input1, winograd_info)));
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, &input1, winograd_info)));
+            }
+            break;
+        }
+        case 5:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, &input1, winograd_info)));
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
+            break;
+        }
+    }
+    // Validate batched matrix multiply
+    TensorShape batched_mm_output_shape = input0.tensor_shape();
+    batched_mm_output_shape[0]          = input1.tensor_shape()[0];
+    const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+    switch(weights->dimension(idx_width))
+    {
+        case 3:
+        {
+            if(input_dims.width > 4 && input_dims.height > 4)
+            {
+                // Validate output transform
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
+            }
+            else
+            {
+                // Validate output transform
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
+            }
+            break;
+        }
+        case 5:
+        {
+            // Validate output transform
+            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(&batched_mm_output, biases, output, winograd_info)));
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
+            break;
+        }
+    }
+
+    // Validate Activation Layer
+    if(act_info.enabled())
+    {
+        NEActivationLayer::validate(output, nullptr, act_info);
+    }
+    return Status{};
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
deleted file mode 100644
index 0ac6d09..0000000
--- a/src/runtime/NEON/functions/NEWinogradLayer.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "support/ToolchainSupport.h"
-
-#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-
-namespace
-{
-inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
-{
-    const int in_width    = input->info()->dimension(0);
-    const int in_height   = input->info()->dimension(1);
-    const int in_batches  = input->info()->dimension(3);
-    const int in_channels = input->info()->dimension(2);
-    return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
-}
-} /* namespace */
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, biases);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != 3 && weights->dimension(0) != 5, "Only 3 and 5 kernels are supported");
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-    }
-
-    // Get parameters from conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
-
-    ARM_COMPUTE_UNUSED(output);
-
-    return Status{};
-}
-} //namespace
-
-NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _batched_gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _permute_input(),
-      _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(),
-      _reshaped_kernel(false)
-{
-} /* arm_compute */
-
-void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, biases, output);
-    ARM_COMPUTE_UNUSED(conv_info);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), biases->info(), output->info(), conv_info));
-
-    _weights = weights;
-    _input   = input;
-    _output  = output;
-
-    std::unique_ptr<INEWinogradLayerBatchedGEMMKernel<float, float>> batched_gemm_kernel;
-    std::unique_ptr<INEWinogradLayerTransformInputKernel<float>>   transform_input_kernel;
-    std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
-    std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>>  transform_output_kernel;
-
-    switch(weights->info()->dimension(0))
-    {
-        case 3:
-        {
-            batched_gemm_kernel      = support::cpp14::make_unique<NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>>();
-            transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
-            transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
-            transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
-            break;
-        }
-        case 5:
-        {
-            batched_gemm_kernel      = support::cpp14::make_unique<NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>>();
-            transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
-            transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
-            transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Not supported.");
-            break;
-        }
-    }
-
-    const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
-    const bool        use_same_padding = use_padding_type == PADDING_SAME;
-
-    // Get parameters from conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-    ARM_COMPUTE_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
-
-    // Get convolved dimensions
-    const int in_channels  = input->info()->dimension(2);
-    const int out_channels = output->info()->dimension(2);
-
-    const Tensor4DShape in_shape(internal_get_input_shape(input));
-    const size_t        data_type_size = input->info()->element_size();
-    // Get the memory required to instantiate a new Winograd operator.
-    constexpr size_t storage_alignment   = 64;
-    const size_t     kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
-    _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
-    _kernel_storage.allocator()->allocate();
-    // Input storage
-    const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
-    _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
-    _input_workspace.allocator()->allocate();
-
-    // Output storage
-    const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size;
-    _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
-    _output_workspace.allocator()->allocate();
-
-    // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
-    TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
-                                _output->info()->dimension(1), _output->info()->dimension(3)),
-                    1, _output->info()->data_type());
-    _output_nhwc.allocator()->init(info);
-    _output_nhwc.allocator()->allocate();
-
-    // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
-    _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
-    _weights_hwio.allocator()->allocate();
-
-    // configure the kernel to transform the input tensor from NCHW -> NHWC
-    _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
-    _input_nhwc.allocator()->allocate();
-
-    const int         weights_width  = weights->info()->dimension(0);
-    const int         weights_height = weights->info()->dimension(1);
-    const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels });
-
-    // Configure the InputTransform
-    const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
-    transform_input_kernel->configure(reinterpret_cast<float *>(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
-                                      reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_stride);
-
-    // Configure WeightsTransform
-    const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
-    transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast<float *>(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels);
-
-    // Configure OutputTransform
-    //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
-    const int  output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
-    const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
-
-    transform_output_kernel->configure(biases, reinterpret_cast<float *>(_output_workspace.buffer()),
-                                       output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
-                                       in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
-
-    // Configure Batched GEMMs
-    const int      output_tile_rows         = batched_gemm_kernel->get_output_tile_rows();
-    const int      output_tile_cols         = batched_gemm_kernel->get_output_tile_cols();
-    const int      n_block                  = batched_gemm_kernel->get_number_blocks();
-    const int      tile_rows                = iceildiv(output_shape.n_rows, output_tile_rows);
-    const int      tile_cols                = iceildiv(output_shape.n_cols, output_tile_cols);
-    const int      m                        = in_shape.n_batches * tile_rows * tile_cols;
-    const int      k                        = in_shape.n_channels;
-    const int      n                        = out_channels;
-    const int      input_matrix_row_stride  = in_shape.n_channels;
-    const int      kernel_matrix_row_stride = roundup(out_channels, n_block);
-    const int      output_matrix_row_stride = kernel_matrix_row_stride;
-    const unsigned n_gemms                  = batched_gemm_kernel->get_number_gemms();
-
-    batched_gemm_kernel->configure(n_gemms, m, k, n,
-                                   input_matrix_stride, input_matrix_row_stride,
-                                   kernel_matrix_stride, kernel_matrix_row_stride,
-                                   output_matrix_stride, output_matrix_row_stride,
-                                   reinterpret_cast<float *>(_input_workspace.buffer()),
-                                   reinterpret_cast<float *>(_kernel_storage.buffer()),
-                                   reinterpret_cast<float *>(_output_workspace.buffer()));
-
-    // Reorder the convoluted output to ACL's ordering NCHW
-    _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
-
-    _transform_input_kernel   = std::move(transform_input_kernel);
-    _transform_weights_kernel = std::move(transform_weights_kernel);
-    _transform_output_kernel  = std::move(transform_output_kernel);
-    _batched_gemm_kernel      = std::move(batched_gemm_kernel);
-}
-
-void NEWinogradLayer::run()
-{
-    _memory_group.acquire();
-    if(!_reshaped_kernel)
-    {
-        _reshaped_kernel = true;
-        _permute_weights.run();
-        NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
-    }
-    //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
-    _permute_input.run();
-
-    // Transform input tensor to the winograd domain
-    NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
-
-    //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
-    NEScheduler::get().schedule(_batched_gemm_kernel.get(), Window::DimX);
-
-    // Transform output tensor to the spatial domain
-    NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
-
-    // Reorder the convoluted output to ACL's ordering NCHW
-    _permute_output.run();
-    _memory_group.release();
-}
-
-Status NEWinogradLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, biases, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(validate_arguments(input, weights, biases, output, conv_info));
-
-    return Status{};
-}
-
-} // namespace arm_compute