arm_compute v17.12
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index 57a1738..cdf1b54 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -34,3 +34,8 @@
     k->configure(input, output, activation_info);
     _kernel = std::move(k);
 }
+
+Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    return NEActivationLayerKernel::validate(input, output, act_info);
+}
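
Many of the functions in this diff gain a static validate() that simply forwards to the corresponding kernel's validate(), so a configuration can be checked before any tensor memory is allocated. A minimal usage sketch for the activation layer; the shapes, data type and activation function below are illustrative assumptions, not taken from the diff:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"

    using namespace arm_compute;

    // Returns true if the candidate configuration is accepted by the kernel's validate().
    bool relu_config_is_valid()
    {
        const TensorInfo src(TensorShape(32U, 32U, 16U), 1, DataType::F32);
        const TensorInfo dst(TensorShape(32U, 32U, 16U), 1, DataType::F32);
        const Status     status = NEActivationLayer::validate(&src, &dst,
                                                              ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
        return status.error_code() == ErrorCode::OK;
    }
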
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 11f5aa7..b5dd4d0 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -36,3 +36,7 @@
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
 }
+Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    return NEArithmeticAdditionKernel::validate(input1, input2, output, policy);
+}
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index 37586af..5c0491e 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -36,3 +36,7 @@
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
 }
+Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    return NEArithmeticSubtractionKernel::validate(input1, input2, output, policy);
+}
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index ef79b02..f6be001 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -43,6 +43,12 @@
     _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
 }
 
+Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma,
+                                           float epsilon)
+{
+    return NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon);
+}
+
 void NEBatchNormalizationLayer::run()
 {
     NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NECol2Im.cpp
similarity index 68%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/NEON/functions/NECol2Im.cpp
index 37857b6..78c6bc0 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NECol2Im.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,21 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NECol2Im.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
 #include "support/ToolchainSupport.h"
 
-#include <utility>
-
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NECol2Im::configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
+    auto k = arm_compute::support::cpp14::make_unique<NECol2ImKernel>();
+    k->configure(input, output, convolved_dims);
     _kernel = std::move(k);
 }
+
+Status NECol2Im::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
+{
+    return NECol2ImKernel::validate(input, output, convolved_dims);
+}
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index f34f497..25c639f 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -136,10 +136,7 @@
     // Get parameters from conv_info
     unsigned int stride_x = 0;
     unsigned int stride_y = 0;
-    unsigned int pad_x    = 0;
-    unsigned int pad_y    = 0;
     std::tie(stride_x, stride_y) = conv_info.stride();
-    std::tie(pad_x, pad_y)       = conv_info.pad();
 
     // Get convolved dimensions
     unsigned int conv_w = 0;
@@ -190,9 +187,17 @@
     {
         if(_are_weights_reshaped)
         {
-            const unsigned int transpose_width = 16 / input->info()->element_size();
-            mat_weights_cols                   = weights_info.num_kernels();
-            mat_weights_rows                   = weights->info()->dimension(0) / transpose_width + (_has_bias ? 1 : 0);
+            if(_is_fully_connected_convolution)
+            {
+                mat_weights_cols = weights_info.num_kernels();
+                mat_weights_rows = weights->info()->dimension(1);
+            }
+            else
+            {
+                const unsigned int transpose_width = 16 / input->info()->element_size();
+                mat_weights_cols                   = weights_info.num_kernels();
+                mat_weights_rows                   = weights->info()->dimension(0) / transpose_width + (_has_bias ? 1 : 0);
+            }
         }
         else
         {
@@ -270,7 +275,7 @@
         // Configure matrix multiplication kernel
         if(_is_fully_connected_convolution)
         {
-            _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f, false, false);
+            _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f);
         }
         else
         {
@@ -295,7 +300,7 @@
     }
 
     _input_im2col_reshaped.allocator()->allocate();
-    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+    _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
     _gemm_output.allocator()->allocate();
 
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
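
The reshaped-weights branch now distinguishes the fully-connected convolution case. A worked restatement of the two formulas above, with assumed F32 weights (element size 4 bytes):

    // Interleaved GEMM path : transpose_width = 16 / 4 = 4 elements per row,
    //                         so mat_weights_rows = reshaped dim(0) / 4 + (has_bias ? 1 : 0)
    // Fully-connected path  : mat_weights_rows is read directly from reshaped dim(1)
    // In both cases mat_weights_cols = weights_info.num_kernels()
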
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
new file mode 100644
index 0000000..7b4e77b
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _scale_f(),
+      _conv_f(),
+      _scaled_output()
+{
+}
+
+void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info,
+                                     unsigned int ax, unsigned int ay, float upscalex, float upscaley)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) < 1);
+
+    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+                                                    info.pad().first, info.pad().second, ax, ay, upscalex, upscaley, info.round());
+
+    const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights, bias);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, weights, bias);
+
+    ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
+    ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
+    ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+
+    _memory_group.manage(&_scaled_output);
+
+    // Configure the scale function
+    // Init and allocate an intermediate tensor for the output: same size as the input, but with the first two axes matching the output tensor
+    TensorShape scale_out_shape(input->info()->tensor_shape());
+    scale_out_shape.set(0, output->info()->dimension(0));
+    scale_out_shape.set(1, output->info()->dimension(1));
+    TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    _scaled_output.allocator()->init(scale_out_info);
+    const unsigned int kernel_size = weights->info()->dimension(0);
+    // Padding for the upsampled image is calculated with the equation: p' = k - p - 1, where k is the kernel size and p is the input padding
+    ARM_COMPUTE_ERROR_ON(info.pad().first > (kernel_size - 1));
+    const unsigned int  tr_px     = kernel_size - info.pad().first - 1;
+    const unsigned int  tr_py     = kernel_size - info.pad().second - 1;
+    const unsigned int  tr_stride = 1;
+    const PadStrideInfo transposed_info(tr_stride, tr_stride, tr_px, tr_py);
+    _scale_f.configure(input, &_scaled_output, std::make_pair(ax, ay), std::make_pair(info.stride().first - 1u, info.stride().second - 1u), transposed_info);
+    // Set up the function to convolve the upscaled output
+    switch(kernel_size)
+    {
+        case 1:
+        {
+            _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL));
+            break;
+        }
+        case 3:
+        {
+            _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
+            break;
+        }
+        case 5:
+        {
+            _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 2, 2, DimensionRoundingType::CEIL));
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Not supported");
+            break;
+        }
+    }
+    _scaled_output.allocator()->allocate();
+}
+
+void NEDeconvolutionLayer::run()
+{
+    _memory_group.acquire();
+    _scale_f.run();
+    _conv_f.run();
+    _memory_group.release();
+}
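
The layer is built as an upsample (_scale_f) followed by an ordinary stride-1 convolution (_conv_f). A short sketch of the padding arithmetic, using assumed example values:

    // Transposed padding applied while upsampling: p' = k - p - 1
    static_assert(3 - 1 - 1 == 1, "3x3 kernel, input pad 1 -> 1 pixel of padding on the upsampled tensor");
    static_assert(3 - 0 - 1 == 2, "3x3 kernel, input pad 0 -> 2 pixels of padding");
    static_assert(5 - 2 - 1 == 2, "5x5 kernel, input pad 2 -> 2 pixels of padding");
    // The follow-up convolution then runs with stride 1 and "same" padding of (k - 1) / 2,
    // which is what the switch over kernel_size encodes: 0 for 1x1, 1 for 3x3, 2 for 5x5.
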
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp
new file mode 100644
index 0000000..79b9b2d
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+#include <cstddef>
+#include <utility>
+
+using namespace arm_compute;
+
+namespace
+{
+inline void precompute_offsets(ITensor *offsets, float wr, size_t input_element_size, const std::pair<unsigned int, unsigned int> &a,
+                               const std::pair<unsigned int, unsigned int> &iz, const PadStrideInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == offsets);
+    Window    win;
+    const int padx          = info.pad().first;
+    const int pady          = info.pad().second;
+    const int ax            = a.first;
+    const int ay            = a.second;
+    const int offset_width  = offsets->info()->dimension(0);
+    const int offset_height = offsets->info()->dimension(1);
+    // The values of ax and ay denote the number of ZEROS to be added on the top and right inner border of the image.
+    // The step value along the X and Y axes depends on the number of zeros to be inserted between samples (number of zeros + 1).
+    // Pre-compute the X offsets; Y's stride is unknown at this point, so the Y offsets can't be pre-computed
+    for(int yi = ay; yi < (offset_height - pady); yi += (1 + iz.second))
+    {
+        for(int xi = padx; xi < (offset_width - ax); xi += (1 + iz.first))
+        {
+            int         *ptr                  = reinterpret_cast<int *>(offsets->ptr_to_element(Coordinates(xi, yi)));
+            const size_t in_xi                = (xi + 0.5f) * wr;
+            *reinterpret_cast<int32_t *>(ptr) = in_xi * input_element_size;
+        }
+    }
+}
+} // namespace
+
+NEDeconvolutionLayerUpsample::NEDeconvolutionLayerUpsample(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _offsets(),
+      _border_handler(),
+      _upsample()
+{
+}
+
+void NEDeconvolutionLayerUpsample::configure(ITensor *input, ITensor *output, const std::pair<unsigned int, unsigned int> &a,
+                                             const std::pair<unsigned int, unsigned int> &iz, const PadStrideInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+    }
+
+    // Get the tensor shape
+    const TensorShape shape(output->info()->dimension(0), output->info()->dimension(1));
+
+    // Compute the ratio between source width/height and destination width/height
+    const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
+    const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+    ARM_COMPUTE_UNUSED(hr);
+    // Get the element size of the input image
+    const size_t input_element_size = input->info()->element_size();
+
+    TensorInfo tensor_info_offsets(shape, Format::S32);
+    _offsets.allocator()->init(tensor_info_offsets);
+
+    _upsample.configure(input, &_offsets, output);
+
+    // Allocate once the configure methods have been called
+    _offsets.allocator()->allocate();
+    // Pre-compute offsets for nearest interpolation
+    std::fill_n(reinterpret_cast<int32_t *>(_offsets.buffer()), _offsets.info()->total_size() / sizeof(int32_t), -1 * input_element_size);
+    precompute_offsets(&_offsets, wr, input_element_size, a, iz, info);
+
+    _border_handler.configure(input, _upsample.border_size(), BorderMode::CONSTANT, PixelValue(0.f));
+}
+
+void NEDeconvolutionLayerUpsample::run()
+{
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    _memory_group.acquire();
+    NEScheduler::get().schedule(&_upsample, Window::DimY);
+    _memory_group.release();
+}
diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
similarity index 89%
rename from src/runtime/NEON/functions/NEDepthConcatenate.cpp
rename to src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
index f8ad2ab..437c941 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenate.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -33,7 +33,7 @@
 
 using namespace arm_compute;
 
-NEDepthConcatenate::NEDepthConcatenate() // NOLINT
+NEDepthConcatenateLayer::NEDepthConcatenateLayer() // NOLINT
     : _inputs_vector(),
       _concat_kernels_vector(),
       _border_handlers_vector(),
@@ -41,12 +41,12 @@
 {
 }
 
-void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output) // NOLINT
+void NEDepthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output) // NOLINT
 {
     ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
 
     _num_inputs             = inputs_vector.size();
-    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
+    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<NEDepthConcatenateLayerKernel[]>(_num_inputs);
     _border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
 
     TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
@@ -64,7 +64,7 @@
     }
 }
 
-void NEDepthConcatenate::run()
+void NEDepthConcatenateLayer::run()
 {
     for(unsigned i = 0; i < _num_inputs; ++i)
     {
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
similarity index 83%
rename from src/runtime/NEON/functions/NEDepthConvert.cpp
rename to src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index 37857b6..9a75404 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -21,18 +21,18 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
 #include "support/ToolchainSupport.h"
 
 #include <utility>
 
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEDepthConvertLayer::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertLayerKernel>();
     k->configure(input, output, policy, shift);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
new file mode 100644
index 0000000..b890c6f
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
+    : _kernel(), _bias_kernel(), _border_handler(), _has_bias(false)
+{
+}
+
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+    // Call convolution kernel
+    _kernel.configure(input, weights, output, conv_info);
+    _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+    if(biases != nullptr)
+    {
+        _bias_kernel.configure(output, biases);
+        _has_bias = true;
+    }
+}
+
+void NEDepthwiseConvolutionLayer3x3::run()
+{
+    NEScheduler::get().schedule(&_border_handler, Window::DimX);
+    NEScheduler::get().schedule(&_kernel, Window::DimX);
+    if(_has_bias)
+    {
+        NEScheduler::get().schedule(&_bias_kernel, Window::DimX);
+    }
+}
+
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
+    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _input_reshaped(), _weights_reshaped(), _v2mm_output()
+{
+}
+
+void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
+
+    const size_t weights_w = weights->info()->dimension(0);
+    const size_t weights_h = weights->info()->dimension(1);
+    const size_t weights_z = weights->info()->dimension(2);
+
+    bool has_bias = (biases != nullptr);
+
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
+
+    // Set up intermediate tensors
+    const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0);
+    const size_t conv_size  = conv_w * conv_h;
+
+    // Im2Col configuration
+    TensorShape shape_im2col = input->info()->tensor_shape();
+    shape_im2col.set(0, patch_size);
+    shape_im2col.set(1, conv_size);
+    shape_im2col.set(2, weights_z);
+    const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    _input_reshaped.allocator()->init(info_im2col);
+    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias);
+
+    // Weights reshape configuration
+    const TensorShape shape_weights_reshape(patch_size, weights_z);
+    const TensorInfo  info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+    _weights_reshaped.allocator()->init(info_weights_reshape);
+    _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases);
+
+    // GEMV configuration
+    TensorShape shape_v2mm_out = input->info()->tensor_shape();
+    shape_v2mm_out.set(0, conv_size * weights_z);
+    shape_v2mm_out.set(1, 1);
+    shape_v2mm_out.set(2, 1);
+    const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    _v2mm_output.allocator()->init(info_v2mm_out);
+    _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+    _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
+
+    // Allocate intermediate tensors
+    _input_reshaped.allocator()->allocate();
+    _weights_reshaped.allocator()->allocate();
+    _v2mm_output.allocator()->allocate();
+}
+
+void NEDepthwiseConvolutionLayer::run()
+{
+    NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
+    NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+    NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
+    NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
+}
\ No newline at end of file
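
A worked shape example for the generic path; the concrete numbers are assumptions, not taken from the diff. Take a 3x3 depthwise filter with a bias on an 8x8 input with 16 channels, convolved with stride 1 and pad 1 (so conv_w = conv_h = 8):

    // patch_size = 3 * 3 + 1 = 10            elements per extracted patch, bias slot included
    // conv_size  = 8 * 8     = 64            one patch per output pixel
    // _input_reshaped   : [ 10, 64, 16 ]     (patch_size, conv_size, channels)
    // _weights_reshaped : [ 10, 16 ]         (patch_size, channels)
    // _v2mm_output      : [ 64 * 16, 1, 1 ] = [ 1024, 1, 1 ]
    // _vector_to_tensor_kernel then folds that 1024-element row back into the 8x8x16 output.
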
diff --git a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
new file mode 100644
index 0000000..d70a668
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+NEDepthwiseSeparableConvolutionLayer::NEDepthwiseSeparableConvolutionLayer()
+    : _depthwise_conv(), _pointwise_conv()
+{
+}
+
+void NEDepthwiseSeparableConvolutionLayer::configure(ITensor *input, const ITensor *depthwise_weights, const ITensor *depthwise_biases, ITensor *depthwise_out,
+                                                     const ITensor *pointwise_weights, const ITensor *pointwise_biases, ITensor *output,
+                                                     const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
+{
+    _depthwise_conv.configure(input, depthwise_weights, depthwise_biases, depthwise_out, depthwise_conv_info);
+    _pointwise_conv.configure(depthwise_out, pointwise_weights, pointwise_biases, output, pointwise_conv_info);
+}
+
+void NEDepthwiseSeparableConvolutionLayer::run()
+{
+    _depthwise_conv.run();
+    _pointwise_conv.run();
+}
\ No newline at end of file
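
A minimal usage sketch of the separable layer, chaining a 3x3 depthwise stage into a 1x1 pointwise stage; all shapes, padding values and data types below are illustrative assumptions:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void configure_separable_block()
    {
        Tensor src, dw_weights, dw_bias, dw_out, pw_weights, pw_bias, dst;

        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));        // 8x8, 16 channels
        dw_weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32)); // one 3x3 filter per channel
        dw_bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        dw_out.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));     // stride 1, pad 1 keeps 8x8
        pw_weights.allocator()->init(TensorInfo(TensorShape(1U, 1U, 16U, 32U), 1, DataType::F32));
        pw_bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 32U), 1, DataType::F32));

        NEDepthwiseSeparableConvolutionLayer sep;
        sep.configure(&src, &dw_weights, &dw_bias, &dw_out, &pw_weights, &pw_bias, &dst,
                      PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0));

        // Allocate after configure() so any padding requirements are known,
        // then fill the tensors and call sep.run().
        src.allocator()->allocate();
        dw_weights.allocator()->allocate();
        dw_bias.allocator()->allocate();
        dw_out.allocator()->allocate();
        pw_weights.allocator()->allocate();
        pw_bias.allocator()->allocate();
        dst.allocator()->allocate();
    }
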
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 52a4cc1..afa5d97 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -34,7 +34,7 @@
 using namespace arm_compute;
 
 NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
+    : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator(), _has_bias(false)
 {
 }
 
@@ -46,38 +46,28 @@
         _accumulator.allocator()->free();
     }
 
+    // Check if the bias should be added to the convolution result
+    _has_bias = (bias != nullptr);
+
     // Allocate the intermediate accumulator tensor in case of fixed point input
-    switch(output->info()->data_type())
+    if(is_data_type_fixed_point(input->info()->data_type()))
     {
-        case DataType::QS8:
+        const DataType promoted_dt = (input->info()->data_type() == DataType::QS8) ? DataType::QS16 : DataType::QS32;
+        _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, promoted_dt, output->info()->fixed_point_position()));
+        _memory_group.manage(&_accumulator);
+        _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+        if(_has_bias)
         {
-            _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
-            _memory_group.manage(&_accumulator);
-            _conv_kernel.configure(input, weights, &_accumulator, conv_info);
             _accumulate_bias_kernel.configure(&_accumulator, bias, output);
-            _accumulator.allocator()->allocate();
-            break;
         }
-        case DataType::QS16:
+        _accumulator.allocator()->allocate();
+    }
+    else
+    {
+        _conv_kernel.configure(input, weights, output, conv_info);
+        if(_has_bias)
         {
-            _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS32, output->info()->fixed_point_position()));
-            _memory_group.manage(&_accumulator);
-            _conv_kernel.configure(input, weights, &_accumulator, conv_info);
-            _accumulate_bias_kernel.configure(&_accumulator, bias, output);
-            _accumulator.allocator()->allocate();
-            break;
-        }
-        case DataType::F16:
-        case DataType::F32:
-        {
-            _conv_kernel.configure(input, weights, output, conv_info);
             _accumulate_bias_kernel.configure(output, bias);
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Data type not supported");
-            break;
         }
     }
 
@@ -85,6 +75,38 @@
     _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
 }
 
+Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+
+    DataType data_type = output->data_type();
+    if(is_data_type_fixed_point(data_type))
+    {
+        // Promote data type in case of fixed point
+        data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
+    }
+    TensorInfo accumulator(output->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
+
+    // Validate Convolution kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerKernel::validate(input, weights, &accumulator, conv_info));
+
+    // Validate bias
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias == nullptr) && is_data_type_fixed_point(data_type),
+                                    "Biases should be provided for fixed point inputs");
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
+                                        "Biases size and number of input feature maps should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
+
+        // Validate bias kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerBiasAccumulateKernel::validate(&accumulator, bias, output));
+    }
+
+    return Status{};
+}
+
 void NEDirectConvolutionLayer::run()
 {
     NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
@@ -92,7 +114,10 @@
     _memory_group.acquire();
 
     NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
-    NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+    if(_has_bias)
+    {
+        NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+    }
 
     _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 2e8d105..fc04e28 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -133,7 +133,7 @@
     const int      num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
     const size_t   linear_input_size    = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
 
-    _linearize_input      = input->info()->tensor_shape().x() != linear_input_size;
+    _linearize_input      = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
     _are_weights_reshaped = are_weights_reshaped;
     _accumulate_biases    = biases != nullptr;
     _is_batched_fc_layer  = num_batch_dimensions > 0;
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index ff92ef8..950f4c9 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
 #include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
@@ -39,6 +40,7 @@
 {
 #include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
 #include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
 #include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
 } // namespace arm_compute
 
@@ -96,6 +98,14 @@
         {
             _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
         }
+        else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f))
+        {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+            _mm_optimised_kernel = support::cpp14::make_unique<NEHGEMMAArch64FP16Kernel>();
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+            ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        }
 #endif /* defined(__arm__) || defined(__aarch64__) */
 
 #if defined(__arm__) || defined(__aarch64__)
@@ -107,19 +117,32 @@
             const int N = d->info()->tensor_shape().x();
             const int K = a->info()->tensor_shape().x();
 
+            size_t workbench_size = 0;
+
 #if defined(__arm__)
-            GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+            workbench_size = GemmInterleaved<sgemm_8x6, sgemm_8x6::operand_type, sgemm_8x6::result_type>(&ci, M, N, K, false, false).get_working_size();
 #elif defined(__aarch64__)
-            GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+            if(a->info()->data_type() == DataType::F32)
+            {
+                workbench_size = GemmInterleaved<sgemm_12x8, sgemm_12x8::operand_type, sgemm_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
+            }
+            else if(a->info()->data_type() == DataType::F16)
+            {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                workbench_size = GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type>(&ci, M, N, K, false, false).get_working_size();
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+                ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+            }
 #endif /* defined(__arm__) || defined(__aarch64__) */
 
             constexpr size_t alignment = 4096;
-            _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+            ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
+            _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
             _memory_group.manage(&_workspace);
 
             // Configure matrix multiplication kernel
             _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f);
-
             _workspace.allocator()->allocate();
         }
         else
diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp
deleted file mode 100644
index 7413b28..0000000
--- a/src/runtime/NEON/functions/NEGEMMLowp.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-
-using namespace arm_compute;
-
-NEGEMMLowp::NEGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
-{
-}
-
-void NEGEMMLowp::configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
-    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
-    ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
-
-    /* The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] */
-    TensorShape shape_tmp_a = a->info()->tensor_shape();
-    shape_tmp_a.set(0, a->info()->dimension(0) * 4);
-    shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
-
-    TensorShape shape_tmp_b = b->info()->tensor_shape();
-    shape_tmp_b.set(0, b->info()->dimension(1) * 16);
-    shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
-
-    TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
-    TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
-    _tmp_a.allocator()->init(info_a);
-    _tmp_b.allocator()->init(info_b);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_tmp_a);
-    _memory_group.manage(&_tmp_b);
-
-    _interleave_kernel.configure(a, &_tmp_a);
-    _transpose_kernel.configure(b, &_tmp_b);
-    _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift);
-
-    _tmp_a.allocator()->allocate();
-    _tmp_b.allocator()->allocate();
-}
-
-void NEGEMMLowp::run()
-{
-    _memory_group.acquire();
-
-    /* Run interleave kernel */
-    NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
-
-    /* Run transpose kernel */
-    NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
-
-    /* Run matrix multiply kernel */
-    NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
-
-    _memory_group.release();
-}
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
new file mode 100644
index 0000000..6e03ffa
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp"
+} // namespace arm_compute
+
+using namespace arm_compute;
+
+NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(), _workspace()
+{
+}
+
+void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::S8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
+    ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+
+#ifdef __aarch64__
+    const int            M                   = output->info()->tensor_shape().y();
+    const int            N                   = output->info()->tensor_shape().x();
+    const int            K                   = a->info()->tensor_shape().x();
+    constexpr size_t     workspace_alignment = 4096;
+    const struct CPUInfo ci                  = NEScheduler::get().cpu_info();
+#endif /* __aarch64__ */
+
+#ifdef ARM_COMPUTE_AARCH64_V8_2
+    if(ci.CPU == CPUTarget::A75_DOT)
+    {
+        // Configure matrix multiply kernel
+        GemmInterleaved<gemm_s8_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
+        _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+        _memory_group.manage(&_workspace);
+
+        // Configure matrix multiplication kernel
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
+        k->configure(a, b, output, &_workspace, 1.f, 1.f);
+        _mm_kernel = std::move(k);
+        _workspace.allocator()->allocate();
+    }
+    else if(ci.CPU == CPUTarget::A55_DOT)
+    {
+        ARM_COMPUTE_ERROR_ON("WIP");
+    }
+    else
+#elif defined(ARM_COMPUTE_AARCH64_V8A)
+    if(ci.CPU == CPUTarget::A53)
+    {
+        switch(a->info()->data_type())
+        {
+            case DataType::S8:
+            {
+                // Configure matrix multiply kernel
+                GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
+                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+            }
+            break;
+            case DataType::U8:
+            {
+                // Configure matrix multiply kernel
+                GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
+                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+            }
+            break;
+            default:
+                ARM_COMPUTE_ERROR("Datatype not supported");
+        }
+
+        _memory_group.manage(&_workspace);
+        // Configure matrix multiplication kernel
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64A53Kernel>();
+        k->configure(a, b, output, &_workspace, 1.f, 1.f);
+        _mm_kernel = std::move(k);
+        _workspace.allocator()->allocate();
+    }
+    else if(1) // Generic v8a kernel
+    {
+        switch(a->info()->data_type())
+        {
+            case DataType::S8:
+            {
+                // Configure matrix multiply kernel
+                GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
+                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+            }
+            break;
+            case DataType::U8:
+            {
+                // Configure matrix multiply kernel
+                GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
+                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+            }
+            break;
+            default:
+                ARM_COMPUTE_ERROR("Datatype not supported");
+        }
+        _memory_group.manage(&_workspace);
+        // Configure matrix multiplication kernel
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64Kernel>();
+        k->configure(a, b, output, &_workspace, 1.f, 1.f);
+        _mm_kernel = std::move(k);
+        _workspace.allocator()->allocate();
+    }
+    else
+#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+    {
+        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+        TensorShape shape_tmp_a = a->info()->tensor_shape();
+        shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+        shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
+
+        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+        TensorShape shape_tmp_b = b->info()->tensor_shape();
+        shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+        shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
+
+        TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+        TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+        _tmp_a.allocator()->init(info_a);
+        _tmp_b.allocator()->init(info_b);
+        _memory_group.manage(&_tmp_a);
+        _memory_group.manage(&_tmp_b);
+
+        // Configure interleave kernel
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+            k->configure(a, &_tmp_a);
+            _mtx_a_reshape_kernel = std::move(k);
+        }
+
+        // Configure transpose kernel
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+            k->configure(b, &_tmp_b);
+            _mtx_b_reshape_kernel = std::move(k);
+        }
+
+        // Configure matrix multiply kernel
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+            k->configure(&_tmp_a, &_tmp_b, output);
+            _mm_kernel = std::move(k);
+        }
+
+        // Allocate tensors
+        _tmp_a.allocator()->allocate();
+        _tmp_b.allocator()->allocate();
+    }
+}
+
+void NEGEMMLowpAssemblyMatrixMultiplyCore::run()
+{
+    _memory_group.acquire();
+    if(_mtx_a_reshape_kernel)
+    {
+        NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+    }
+
+    if(_mtx_b_reshape_kernel)
+    {
+        NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+    }
+
+    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+
+    _memory_group.release();
+}
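
The assembly core multiplies two 8-bit matrices into a 32-bit result. A minimal usage sketch; M, N, K and the data types are assumptions for illustration (the asserts above only require a's width to equal b's height and the output to match a's rows and b's columns):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_lowp_gemm()
    {
        constexpr unsigned int M = 64, N = 128, K = 256;

        Tensor a, b, dst;
        a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::U8));    // LHS: K columns, M rows
        b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::U8));    // RHS: N columns, K rows
        dst.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::S32)); // result: N columns, M rows

        NEGEMMLowpAssemblyMatrixMultiplyCore gemm;
        gemm.configure(&a, &b, &dst);

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();

        // ... fill a and b with quantised data ...
        gemm.run();
    }
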
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
new file mode 100644
index 0000000..50aa5b6
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
+} // namespace arm_compute
+
+using namespace arm_compute;
+
+NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
+      _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _dot_product_path(false)
+{
+}
+
+void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+    ARM_COMPUTE_UNUSED(gemm_info);
+    ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
+
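+    // Cache the quantization offsets of A and B and check whether A reduces to a single row (vector-by-matrix case)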
+    _a_offset                         = a->info()->quantization_info().offset;
+    _b_offset                         = b->info()->quantization_info().offset;
+    _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+
+#ifdef ARM_COMPUTE_AARCH64_V8_2
+    // Check for DOT product instruction
+    const struct CPUInfo ci              = NEScheduler::get().cpu_info();
+    const int            cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
+
+    if(cpu_has_dotprod != 0)
+    {
+        _dot_product_path = true;
+
+        // Retrieve the CPU info and GEMM dimensions needed to set up the assembly kernel
+        struct CPUInfo ci = NEScheduler::get().cpu_info();
+        const int      M  = output->info()->tensor_shape().y();
+        const int      N  = output->info()->tensor_shape().x();
+        const int      K  = a->info()->tensor_shape().x();
+
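+        // Size the working space: one aligned region per thread for the assembly GEMM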
+        const size_t     workbench_size = GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
+        constexpr size_t alignment      = 4096;
+        _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+        _memory_group.manage(&_workspace);
+
+        // Configure matrix multiplication kernel
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
+        k->configure(a, b, output, &_workspace, 1.f, 1.f);
+        _mm_kernel = std::move(k);
+    }
+    else
+#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+    {
+        if(_run_vector_matrix_multiplication)
+        {
+            // Configure matrix multiply kernel
+            {
+                auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+                k->configure(a, b, output);
+                _mm_kernel = std::move(k);
+            }
+        }
+        else
+        {
+            // The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ]
+            TensorShape shape_tmp_a = a->info()->tensor_shape();
+            shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+            shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
+
+            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+            TensorShape shape_tmp_b = b->info()->tensor_shape();
+            shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+            shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
+
+            TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+            TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+            _tmp_a.allocator()->init(info_a);
+            _tmp_b.allocator()->init(info_b);
+            _memory_group.manage(&_tmp_a);
+            _memory_group.manage(&_tmp_b);
+
+            // Configure interleave kernel
+            {
+                auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+                k->configure(a, &_tmp_a);
+                _mtx_a_reshape_kernel = std::move(k);
+            }
+
+            // Configure transpose kernel
+            {
+                auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+                k->configure(b, &_tmp_b);
+                _mtx_b_reshape_kernel = std::move(k);
+            }
+
+            // Configure matrix multiply kernel
+            {
+                auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+                k->configure(&_tmp_a, &_tmp_b, output);
+                _mm_kernel = std::move(k);
+            }
+        }
+    }
+
+    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+    if(_a_offset != 0)
+    {
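+        // The column sums of B are used by the offset contribution kernel to compensate for the non-zero offset of A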
+        TensorShape shape_vector_sum_col = b->info()->tensor_shape();
+        if(b->info()->num_dimensions() > 1)
+        {
+            shape_vector_sum_col.remove_dimension(1);
+        }
+        TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);
+        _vector_sum_col.allocator()->init(info_vector_sum_col);
+        _memory_group.manage(&_vector_sum_col);
+
+        // Configure Matrix B reduction kernel
+        _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);
+    }
+
+    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+    if(_b_offset != 0)
+    {
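+        // The row sums of A are used by the offset contribution kernel to compensate for the non-zero offset of B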
+        TensorShape shape_vector_sum_row = a->info()->tensor_shape();
+        shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));
+        if(a->info()->num_dimensions() > 1)
+        {
+            shape_vector_sum_row.remove_dimension(1);
+        }
+        TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);
+        _vector_sum_row.allocator()->init(info_vector_sum_row);
+        _memory_group.manage(&_vector_sum_row);
+
+        // Configure matrix A reduction kernel
+        _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);
+    }
+
+    // Configure offset contribution kernel
+    _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+
+    // Allocate tensors
+    if(!_dot_product_path && !_run_vector_matrix_multiplication)
+    {
+        _tmp_a.allocator()->allocate();
+        _tmp_b.allocator()->allocate();
+    }
+    else
+    {
+        _workspace.allocator()->allocate();
+    }
+
+    if(_a_offset != 0)
+    {
+        _vector_sum_col.allocator()->allocate();
+    }
+
+    if(_b_offset != 0)
+    {
+        _vector_sum_row.allocator()->allocate();
+    }
+}
+
+Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
+                                    "The output matrix must have the same number of rows as the matrix A");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
+                                    "The output matrix must have the same number of columns as the matrix B");
+    ARM_COMPUTE_UNUSED(gemm_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+    int32_t a_offset                         = a->quantization_info().offset;
+    int32_t b_offset                         = b->quantization_info().offset;
+    bool    run_vector_matrix_multiplication = a->dimension(1) < 2;
+
+#ifdef ARM_COMPUTE_AARCH64_V8_2
+    // Check for DOT product instruction
+    const struct CPUInfo ci              = NEScheduler::get().cpu_info();
+    const int            cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
+
+    if(cpu_has_dotprod != 0)
+    {
+        // Validate matrix multiply kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpAArch64V8P4Kernel::validate(a, b, output));
+    }
+    else
+#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+    {
+        if(!run_vector_matrix_multiplication)
+        {
+            // The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ]
+            TensorShape shape_tmp_a = a->tensor_shape();
+            shape_tmp_a.set(0, a->dimension(0) * 4);
+            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+            TensorShape shape_tmp_b = b->tensor_shape();
+            shape_tmp_b.set(0, b->dimension(1) * 16);
+            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+            TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+            TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+        }
+    }
+
+    TensorInfo info_vector_sum_col, info_vector_sum_row;
+
+    // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+    if(a_offset != 0)
+    {
+        TensorShape shape_vector_sum_col = b->tensor_shape();
+        shape_vector_sum_col.remove_dimension(1);
+        info_vector_sum_col = TensorInfo(shape_vector_sum_col, 1, DataType::S32);
+
+        // Configure Matrix B reduction kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false));
+    }
+
+    // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+    if(b_offset != 0)
+    {
+        TensorShape shape_vector_sum_row = a->tensor_shape();
+        shape_vector_sum_row.set(Window::DimX, a->dimension(1));
+        shape_vector_sum_row.remove_dimension(1);
+        info_vector_sum_row = TensorInfo(shape_vector_sum_row, 1, DataType::S32);
+
+        // Configure matrix A reduction kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
+    }
+
+    // Validate offset contribution kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
+                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
+                                                                             b_offset == 0 ? nullptr : &info_vector_sum_row,
+                                                                             a_offset, b_offset));
+
+    return Status{};
+}
+
+void NEGEMMLowpMatrixMultiplyCore::run()
+{
+    _memory_group.acquire();
+
+    // Reshape A and B only when we are not in the vector-by-matrix case and the optimized dot-product GEMM path is not used
+    if(!_run_vector_matrix_multiplication && !_dot_product_path)
+    {
+        if(_mtx_a_reshape_kernel)
+        {
+            NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+        }
+
+        if(_mtx_b_reshape_kernel)
+        {
+            NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+        }
+    }
+
+    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+
+    // Run matrix A reduction kernel only if _b_offset is not equal to 0
+    if(_b_offset != 0)
+    {
+        NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
+    }
+
+    // Run matrix B reduction kernel only if _a_offset is not equal to 0
+    if(_a_offset != 0)
+    {
+        NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+    }
+
+    // Run offset contribution kernel
+    NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+
+    _memory_group.release();
+}
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
new file mode 100644
index 0000000..8c02436
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
+{
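+    // Quantize the int32 accumulators down to uint8: add the bias and result offset, apply the integer multiplier and right shift, then clamp to [min, max]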
+    auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel>();
+    k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max);
+    _kernel = std::move(k);
+}
+
+Status NEGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    return NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(input, bias, output, min, max);
+}
+
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
+                                                                    int result_offset_after_shift, int min, int max)
+{
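+    // Quantize the int32 accumulators down to uint8 using a fixed-point multiplier and shift, adding the offset after the shift and clamping to [min, max]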
+    auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
+    k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
+    _kernel = std::move(k);
+}
+
+Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max);
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index 84ea0ca..8a85bba 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -47,7 +47,8 @@
 }
 
 NEGaussianPyramidHalf::NEGaussianPyramidHalf() // NOLINT
-    : _border_handler(),
+    : _horizontal_border_handler(),
+      _vertical_border_handler(),
       _horizontal_reduction(),
       _vertical_reduction()
 {
@@ -62,6 +63,9 @@
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
     ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale());
 
+    // Constant value to use for vertical fill border when the border mode is CONSTANT
+    const uint16_t pixel_value_u16 = static_cast<uint16_t>(constant_border_value) * 2 + static_cast<uint16_t>(constant_border_value) * 8 + static_cast<uint16_t>(constant_border_value) * 6;
+
     /* Get number of pyramid levels */
     const size_t num_levels = pyramid->info()->num_levels();
 
@@ -70,9 +74,10 @@
 
     if(num_levels > 1)
     {
-        _border_handler       = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
-        _horizontal_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
-        _vertical_reduction   = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
+        _horizontal_border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
+        _vertical_border_handler   = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
+        _horizontal_reduction      = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
+        _vertical_reduction        = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
 
         // Apply half scale to the X dimension of the tensor shape
         TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -84,13 +89,16 @@
         for(unsigned int i = 0; i < num_levels - 1; ++i)
         {
             /* Configure horizontal kernel */
-            _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED);
+            _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
 
             /* Configure vertical kernel */
-            _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED);
+            _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
 
             /* Configure border */
-            _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+            _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+
+            /* Configure vertical border */
+            _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
         }
 
         _tmp.allocate();
@@ -109,8 +117,9 @@
 
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
-        NEScheduler::get().schedule(_border_handler.get() + i, Window::DimZ);
+        NEScheduler::get().schedule(_horizontal_border_handler.get() + i, Window::DimZ);
         NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
+        NEScheduler::get().schedule(_vertical_border_handler.get() + i, Window::DimZ);
         NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
     }
 }
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
similarity index 64%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/NEON/functions/NEIm2Col.cpp
index 37857b6..8e90e66 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,21 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
 #include "support/ToolchainSupport.h"
 
-#include <utility>
-
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
+    auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
+    k->configure(input, output, kernel_dims, conv_info, has_bias);
     _kernel = std::move(k);
 }
+
+Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias);
+}
diff --git a/src/runtime/NEON/functions/NEL2Normalize.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
similarity index 86%
rename from src/runtime/NEON/functions/NEL2Normalize.cpp
rename to src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index 349a781..fa62483 100644
--- a/src/runtime/NEON/functions/NEL2Normalize.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -21,19 +21,19 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEL2Normalize.h"
+#include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 using namespace arm_compute;
 
-NEL2Normalize::NEL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
 }
 
-void NEL2Normalize::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
+void NEL2NormalizeLayer::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
 {
     // Manage intermediate buffers
     _memory_group.manage(&_sumsq);
@@ -46,7 +46,7 @@
     _sumsq.allocator()->allocate();
 }
 
-void NEL2Normalize::run()
+void NEL2NormalizeLayer::run()
 {
     _memory_group.acquire();
 
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index a680f1f..0e149d4 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -28,7 +28,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
 #include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
 #include "arm_compute/runtime/Tensor.h"
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index cb48598..b29b796 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -111,7 +111,7 @@
     _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
     _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
     _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
-    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+    _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
 
     // Allocate intermediate tensors
     _weights_reshaped.allocator()->allocate();
diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp
index 7877995..f865054 100644
--- a/src/runtime/NEON/functions/NEMagnitude.cpp
+++ b/src/runtime/NEON/functions/NEMagnitude.cpp
@@ -31,18 +31,36 @@
 
 using namespace arm_compute;
 
-void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, bool use_fp16)
+void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, MagnitudeType mag_type, bool use_fp16)
 {
     if(use_fp16)
     {
-        auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
-        k->configure(input1, input2, output, nullptr);
-        _kernel = std::move(k);
+        if(mag_type == MagnitudeType::L1NORM)
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::SIGNED>>();
+            k->configure(input1, input2, output, nullptr);
+            _kernel = std::move(k);
+        }
+        else
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+            k->configure(input1, input2, output, nullptr);
+            _kernel = std::move(k);
+        }
     }
     else
     {
-        auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
-        k->configure(input1, input2, output, nullptr);
-        _kernel = std::move(k);
+        if(mag_type == MagnitudeType::L1NORM)
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>>();
+            k->configure(input1, input2, output, nullptr);
+            _kernel = std::move(k);
+        }
+        else
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+            k->configure(input1, input2, output, nullptr);
+            _kernel = std::move(k);
+        }
     }
 }
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index e01ef66..af98ac1 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -37,9 +37,9 @@
 {
 }
 
-void NENormalizationLayer::configure(const ITensor *input, ITensor *output, NormalizationLayerInfo norm_info)
+void NENormalizationLayer::configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info)
 {
-    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
     _input_squared.allocator()->init(tensor_info);
@@ -56,6 +56,17 @@
     _input_squared.allocator()->allocate();
 }
 
+Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+{
+    // Perform validation step
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+
+    return Status{};
+}
+
 void NENormalizationLayer::run()
 {
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NEPhase.cpp b/src/runtime/NEON/functions/NEPhase.cpp
index 436d22f..6392281 100644
--- a/src/runtime/NEON/functions/NEPhase.cpp
+++ b/src/runtime/NEON/functions/NEPhase.cpp
@@ -30,9 +30,18 @@
 
 using namespace arm_compute;
 
-void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output, PhaseType phase_type)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
-    k->configure(input1, input2, nullptr, output);
-    _kernel = std::move(k);
+    if(phase_type == PhaseType::UNSIGNED)
+    {
+        auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>();
+        k->configure(input1, input2, nullptr, output);
+        _kernel = std::move(k);
+    }
+    else
+    {
+        auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+        k->configure(input1, input2, nullptr, output);
+        _kernel = std::move(k);
+    }
 }
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 2e2ea11..5a474e4 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -36,3 +36,7 @@
     k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
     _kernel = std::move(k);
 }
+Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+    return NEPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
+}
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index f8a85b9..530c7fc 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -48,6 +48,11 @@
     _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, PixelValue(static_cast<float>(0.f)));
 }
 
+Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+    return NEPoolingLayerKernel::validate(input, output, pool_info);
+}
+
 void NEPoolingLayer::run()
 {
     // Fill border
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index bbd3fac..bd565c9 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -42,9 +42,11 @@
 
 namespace
 {
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size)
+void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == offsets);
+    ARM_COMPUTE_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+    ARM_COMPUTE_UNUSED(sampling_policy);
 
     Window win;
     win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
@@ -95,7 +97,7 @@
 {
 }
 
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == input);
     ARM_COMPUTE_ERROR_ON(nullptr == output);
@@ -131,13 +133,13 @@
             TensorInfo tensor_info_offsets(shape, Format::S32);
             _offsets.allocator()->init(tensor_info_offsets);
 
-            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
+            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined, sampling_policy);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
 
             // Pre-compute offsets for nearest interpolation
-            precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size);
+            precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size, sampling_policy);
             break;
         }
         case InterpolationPolicy::BILINEAR:
@@ -149,7 +151,7 @@
             _dx.allocator()->init(tensor_info_dxdy);
             _dy.allocator()->init(tensor_info_dxdy);
 
-            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined);
+            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined, sampling_policy);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -157,7 +159,7 @@
             _dy.allocator()->allocate();
 
             // Pre-compute dx, dy and offsets for bilinear interpolation
-            precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size);
+            precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size, sampling_policy);
             break;
         }
         case InterpolationPolicy::AREA:
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index cc5d4e9..8e6773c 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -36,9 +36,9 @@
 {
 }
 
-void NESoftmaxLayer::configure(ITensor *input, ITensor *output)
+void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Create intermediate tensors shapes
     TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
@@ -57,7 +57,7 @@
 
     // Configure Kernels
     _max_kernel.configure(input, &_max);
-    _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
+    _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
     _norm_kernel.configure(&_tmp, &_sum, output);
     _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE);
 
@@ -67,6 +67,23 @@
     _sum.allocator()->allocate();
 }
 
+Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta)
+{
+    // Perform validation step
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
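+    // The max and sum intermediates collapse the softmax dimension (dimension 0) to a single element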
+    TensorShape max_sum_shape = input->tensor_shape();
+    max_sum_shape.set(0, 1);
+
+    TensorInfo tensor_info_max_sum(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(max_sum_shape));
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum));
+    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DShiftExpSumKernel::validate(input, &tensor_info_max_sum, input, &tensor_info_max_sum, beta));
+    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DNormKernel::validate(input, &tensor_info_max_sum, output));
+
+    return Status{};
+}
+
 void NESoftmaxLayer::run()
 {
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
index eb81e02..b5b28e8 100644
--- a/src/runtime/NEON/functions/NETranspose.cpp
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -36,3 +36,8 @@
     k->configure(input, output);
     _kernel = std::move(k);
 }
+
+Status NETranspose::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return NETransposeKernel::validate(input, output);
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
new file mode 100644
index 0000000..3251de4
--- /dev/null
+++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace
+{
+inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
+{
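+    // The Winograd code works on NHWC-ordered shapes, so return the dimensions as [batches, height, width, channels]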
+    const int in_width    = input->info()->dimension(0);
+    const int in_height   = input->info()->dimension(1);
+    const int in_batches  = input->info()->dimension(3);
+    const int in_channels = input->info()->dimension(2);
+    return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
+}
+} /* namespace */
+
+namespace arm_compute
+{
+NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _winograd_kernel(), _weights_workspace(), _workspace(), _kernel_storage(), _input(), _weights(), _output(), _reshaped_kernel(false), _conv()
+{
+}
+
+void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(1) != 3 || weights->info()->dimension(0) != 3, "Only 3x3 kernels are supported");
+    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+    }
+
+    _weights = weights;
+    _input   = input;
+    _output  = output;
+
+    // Get parameters from conv_info
+    unsigned int stride_x = 0;
+    unsigned int stride_y = 0;
+    std::tie(stride_x, stride_y) = conv_info.stride();
+    ARM_COMPUTE_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
+
+    // Get convolved dimensions
+    auto      padding     = PADDING_VALID;
+    const int in_channels = input->info()->dimension(2);
+
+    const int out_channels   = output->info()->dimension(2);
+    const int weights_width  = weights->info()->dimension(0);
+    const int weights_height = weights->info()->dimension(1);
+
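+    // Build the kernel shape in the [output channels, kernel height, kernel width, input channels] order used by the Winograd code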
+    const KernelShape   kernel_shape({ out_channels, weights_height, weights_width, in_channels });
+    const Tensor4DShape in_shape(internal_get_input_shape(input));
+
+    // Get the memory required to instantiate a new Winograd operator.
+    constexpr size_t kstore_alignment          = 64;
+    const size_t     kernel_storage_per_thread = NEWinogradLayerKernel::get_kernel_storage_size(kernel_shape);
+    _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_per_thread + kstore_alignment - 1) }, 1, DataType::U8));
+    _memory_group.manage(&_kernel_storage);
+
+    // Get workbench size and allocate memory
+    constexpr size_t wspace_alignment = 64;
+    const size_t     ws_size          = NEWinogradLayerKernel::get_working_space_size(in_shape, kernel_shape, padding);
+    _workspace.allocator()->init(TensorInfo(TensorShape{ (ws_size + wspace_alignment - 1) }, 1, DataType::U8));
+    _memory_group.manage(&_workspace);
+
+    // Workspace for weights transform
+    const size_t weights_transform_size = NEWinogradLayerKernel::get_kernel_transform_working_size(kernel_shape);
+    _weights_workspace.allocator()->init(TensorInfo(TensorShape{ (weights_transform_size + wspace_alignment - 1) }, 1, DataType::U8));
+    _memory_group.manage(&_weights_workspace);
+
+    _kernel_storage.allocator()->allocate();
+    _workspace.allocator()->allocate();
+    _weights_workspace.allocator()->allocate();
+
+    // Create Winograd operator object
+    _conv = support::cpp14::make_unique<Winograd3x3F32>(kernel_shape, in_shape, padding, _kernel_storage.buffer());
+
+    // Configure the kernel; no padding is needed, so it is safe to call configure after allocate
+    _winograd_kernel.configure(output, _conv.get());
+}
+
+void NEWinogradLayer::run()
+{
+#if defined(__aarch64__)
+    _memory_group.acquire();
+    if(!_reshaped_kernel)
+    {
+        _conv->transform_weights(reinterpret_cast<const float *>(_weights->buffer()), reinterpret_cast<float *>(_weights_workspace.buffer()));
+        _reshaped_kernel = true;
+    }
+    const Tensor4DShape in_shape(internal_get_input_shape(_input));
+    auto                padding = PADDING_VALID;
+
+    //Convert the input from NCHW to NHWC, as the Winograd code expects the tensor in NHWC format
+    _conv->nchw2nhwc(in_shape, padding, _workspace.buffer(), reinterpret_cast<const float *>(_input->buffer()));
+
+    //Get ptrs into the workspace
+    std::pair<void *, void *> nhwc_ptrs = _conv->get_nhwc_ptrs(in_shape, padding, _workspace.buffer());
+
+    //Set up the matrix pointers and transform the input tensor to the appropriate form before running GEMM.
+    _conv->reshape_input(in_shape, padding, nhwc_ptrs.second, _workspace.buffer());
+
+    //Run the 16 GEMMs in multiple threads; each kernel invocation runs one or more of them
+    NEScheduler::get().schedule(&_winograd_kernel, Window::DimY);
+
+    //Transform the output to the appropriate form
+    _conv->reshape_output(in_shape, padding, nhwc_ptrs.first);
+
+    //Transform back to NCHW
+    _conv->nhwc2nchw(in_shape, padding, _workspace.buffer(), reinterpret_cast<float *>(_output->buffer()));
+
+    _memory_group.release();
+#else  /* __aarch64__ */
+    ARM_COMPUTE_UNUSED(_winograd_kernel);
+    ARM_COMPUTE_UNUSED(_workspace);
+    ARM_COMPUTE_UNUSED(_kernel_storage);
+    ARM_COMPUTE_UNUSED(_input);
+    ARM_COMPUTE_UNUSED(_weights);
+    ARM_COMPUTE_UNUSED(_output);
+    ARM_COMPUTE_UNUSED(_reshaped_kernel);
+    ARM_COMPUTE_UNUSED(_conv);
+    ARM_COMPUTE_ERROR("Winograd only supported for aarch64, recompile with arch=arm64-v8a.");
+#endif /* __aarch64__ */
+}
+} // namespace arm_compute