arm_compute v18.08
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index 534c4f9..bbc513d 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -61,7 +61,7 @@
 
 void CLMemory::create_empty_region()
 {
-    _region_owned = std::make_shared<CLBufferMemoryRegion>(cl::Context::getDefault(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0);
+    _region_owned = std::make_shared<CLBufferMemoryRegion>(cl::Context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0);
     _region       = _region_owned.get();
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index fdae615..a311c6f 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -25,13 +25,24 @@
 
 #include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/runtime/CL/CLTuner.h"
+#include "arm_compute/runtime/CL/tuners/Tuners.h"
 
 using namespace arm_compute;
 
+namespace
+{
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+void printf_callback(const char *buffer, unsigned int len, size_t complete, void *user_data)
+{
+    printf("%.*s", len, buffer);
+}
+#endif /* defined(ARM_COMPUTE_DEBUG_ENABLED) */
+} // namespace
+
 std::once_flag CLScheduler::_initialize_symbols;
 
 CLScheduler::CLScheduler()
-    : _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
+    : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner(nullptr), _cl_default_static_tuner(nullptr)
 {
 }
 
@@ -42,6 +53,61 @@
     return scheduler;
 }
 
+void CLScheduler::default_init(ICLTuner *cl_tuner)
+{
+    if(!_is_initialised)
+    {
+        std::vector<cl::Platform> platforms;
+        cl::Platform::get(&platforms);
+        ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
+        cl::Platform            p = platforms[0];
+        cl::Context             ctx;
+        cl::Device              device;
+        std::vector<cl::Device> platform_devices;
+        p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
+        ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
+        device = platform_devices[0];
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+
+        // Query devices in the context for cl_arm_printf support
+        if(device_supports_extension(device, "cl_arm_printf"))
+        {
+            // Create a cl_context with a printf_callback and user specified buffer size.
+            cl_context_properties properties[] =
+            {
+                CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(p()),
+                // Enable a printf callback function for this context.
+                CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
+                // Request a minimum printf buffer size of 4MB for devices in the
+                // context that support this extension.
+                CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
+                0
+            };
+            ctx = cl::Context(device, properties);
+        }
+        else
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+        {
+            cl_context_properties properties[] =
+            {
+                CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(p()),
+                0
+            };
+            ctx = cl::Context(device, properties);
+        };
+
+        cl::CommandQueue queue = cl::CommandQueue(ctx, device);
+        CLKernelLibrary::get().init("./cl_kernels/", ctx, device);
+        init(ctx, queue, device, cl_tuner);
+
+        // Create a default static tuner and set if none was provided
+        _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
+    }
+
+    // Set CL tuner
+    _cl_tuner = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
+}
+
 void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index 54e7c5b..dd716f7 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -74,8 +74,16 @@
 
     if(_associated_memory_group == nullptr)
     {
-        ARM_COMPUTE_ERROR_ON(_memory.region()->cl_data().get() != nullptr);
-        _memory = CLMemory(allocate_region(CLScheduler::get().context(), info().total_size(), 0));
+        if(_memory.region()->cl_data().get() != nullptr)
+        {
+            // Memory is already allocated. Reuse it if big enough, otherwise fire an assertion
+            ARM_COMPUTE_ERROR_ON_MSG(info().total_size() > _memory.region()->size(), "Reallocation of a bigger memory region is not allowed!");
+        }
+        else
+        {
+            // Perform memory allocation
+            _memory = CLMemory(allocate_region(CLScheduler::get().context(), info().total_size(), 0));
+        }
     }
     else
     {
diff --git a/src/runtime/CL/functions/CLArithmeticDivision.cpp b/src/runtime/CL/functions/CLArithmeticDivision.cpp
new file mode 100644
index 0000000..1c2849c
--- /dev/null
+++ b/src/runtime/CL/functions/CLArithmeticDivision.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLArithmeticDivision.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLArithmeticDivisionKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticDivisionKernel>();
+    k->configure(input1, input2, output);
+    _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
+}
+
+Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLArithmeticDivisionKernel::validate(input1, input2, output);
+}
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 5acb8e7..84e8709 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,16 +50,22 @@
       _visited(),
       _recorded(),
       _l1_list_counter(),
-      _l1_stack()
+      _l1_stack(),
+      _output(nullptr)
 {
 }
 
-void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value)
+void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode,
+                            uint8_t constant_border_value)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type));
-    ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr);
+    ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7));
+    ARM_COMPUTE_ERROR_ON((lower_thr < 0) || (lower_thr >= upper_thr));
+
+    _output = output;
 
     const unsigned int L1_hysteresis_stack_size = 8;
     const TensorShape  shape                    = input->info()->tensor_shape();
@@ -122,7 +128,7 @@
     }
     else
     {
-        ARM_COMPUTE_ERROR("Gradient %d size not supported", gradient_size);
+        ARM_COMPUTE_ERROR("Gradient size %d not supported", gradient_size);
     }
 
     // Manage intermediate buffers
@@ -187,6 +193,7 @@
     CLScheduler::get().enqueue(_non_max_suppr, false);
 
     // Clear temporary structures and run edge trace
+    _output->clear(CLScheduler::get().queue());
     _visited.clear(CLScheduler::get().queue());
     _recorded.clear(CLScheduler::get().queue());
     _l1_list_counter.clear(CLScheduler::get().queue());
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
new file mode 100644
index 0000000..018c674
--- /dev/null
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
+
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
+#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+CLConcatenateLayer::CLConcatenateLayer()
+    : _concat_function(nullptr)
+{
+}
+
+void CLConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, DataLayoutDimension axis)
+{
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    switch(get_data_layout_dimension_index(output->info()->data_layout(), axis))
+    {
+        case 0:
+        {
+            auto func = support::cpp14::make_unique<CLWidthConcatenateLayer>();
+            func->configure(inputs_vector, output);
+            _concat_function = std::move(func);
+            break;
+        }
+        case 2:
+        {
+            auto func = support::cpp14::make_unique<CLDepthConcatenateLayer>();
+            func->configure(inputs_vector, output);
+            _concat_function = std::move(func);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Concatenation is supported across width and depth only!");
+    }
+}
+
+Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, DataLayoutDimension axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
+
+    switch(get_data_layout_dimension_index(output->data_layout(), axis))
+    {
+        case 0:
+            ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayer::validate(inputs_vector, output));
+            break;
+        case 2:
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayer::validate(inputs_vector, output));
+            break;
+        default:
+            ARM_COMPUTE_RETURN_ERROR_MSG("Concatenation is supported across width and depth only!");
+    }
+    return Status{};
+}
+
+void CLConcatenateLayer::run()
+{
+    ARM_COMPUTE_ERROR_ON(_concat_function == nullptr);
+    _concat_function->run();
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 47a8d5f..0014e71 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -43,17 +43,18 @@
 }
 
 void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
+                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
-                                                            enable_fast_math));
+                                                            enable_fast_math, num_groups));
 
     switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
                                                       weights_info, act_info, CLScheduler::get().target(), dilation, enable_fast_math))
     {
         case ConvolutionMethod::WINOGRAD:
         {
+            ARM_COMPUTE_ERROR_ON(num_groups != 1);
             auto f = arm_compute::support::cpp14::make_unique<CLWinogradConvolutionLayer>(_memory_manager);
             f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
             _function = std::move(f);
@@ -61,6 +62,7 @@
         }
         case ConvolutionMethod::DIRECT:
         {
+            ARM_COMPUTE_ERROR_ON(num_groups != 1);
             auto f = arm_compute::support::cpp14::make_unique<CLDirectConvolutionLayer>();
             f->configure(input, weights, biases, output, conv_info, act_info);
             _function = std::move(f);
@@ -69,7 +71,7 @@
         case ConvolutionMethod::GEMM:
         {
             auto f = arm_compute::support::cpp14::make_unique<CLGEMMConvolutionLayer>(_memory_manager);
-            f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
+            f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
             _function = std::move(f);
             break;
         }
@@ -80,9 +82,10 @@
 }
 
 Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
+                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
 
     const GPUTarget gpu_target = CLScheduler::get().target();
 
@@ -91,19 +94,21 @@
         case ConvolutionMethod::WINOGRAD:
         {
             //Validate Winograd
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "Grouping (num_groups != 1) with CLWinogradConvolutionLayer is not supported");
             ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
             break;
         }
         case ConvolutionMethod::DIRECT:
         {
             // Validate direct convolution layer
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "Grouping (num_groups != 1) with CLDirectConvolutionLayer is not supported");
             ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
             break;
         }
         case ConvolutionMethod::GEMM:
         {
             // Validate gemm-based convolution layer
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups));
             break;
         }
         default:
@@ -123,8 +128,47 @@
     ARM_COMPUTE_UNUSED(weights_info);
     ARM_COMPUTE_UNUSED(gpu_target);
 
+    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
     const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
 
+    /* Input spatial dims, kernel size, IFM/OFM, conv info*/
+    using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>;
+    using ConfigurationMethod      = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
+
+    const std::vector<ConfigurationMethod> known_configs =
+    {
+        // Alexnet
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
+        // VGG16 / VGG19
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
+        // Mobilenet 224
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
+        // Mobilenet 160
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
+        // Mobilenet 224
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
+        // Mobilenet 160
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
+    };
+
+    const auto find_config = [&](ConfigurationMethod c)
+    {
+        const ConvolutionConfiguration config      = c.first;
+        const PadStrideInfo            info        = std::get<3>(config);
+        const DataLayout               data_layout = std::get<4>(config);
+
+        return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
+               && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
+               && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == input->data_layout());
+    };
+
+    std::vector<ConfigurationMethod>::const_iterator found;
+    if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+    {
+        return (*found).second;
+    }
+
     if(dilation != Size2D(1U, 1U) || (input->dimension(idx_c) < 16))
     {
         return ConvolutionMethod::GEMM;
diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp
index 3442e37..d1b7926 100644
--- a/src/runtime/CL/functions/CLCopy.cpp
+++ b/src/runtime/CL/functions/CLCopy.cpp
@@ -41,3 +41,8 @@
     k->configure(input, output);
     _kernel = std::move(k);
 }
+
+Status CLCopy::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
+{
+    return CLCopyKernel::validate(input, output);
+}
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index cb8dc02..40562b5 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -38,15 +38,16 @@
     : _memory_group(std::move(memory_manager)),
       _scale_f(),
       _conv_f(),
-      _scaled_output()
+      _scaled_output(),
+      _is_prepared(false)
 {
 }
 
 Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
-                                      unsigned int inner_border_right, unsigned int inner_border_top)
+                                      unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
@@ -63,12 +64,10 @@
     const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape());
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, weights);
 
     if(bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
     }
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
@@ -80,13 +79,13 @@
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
 
     return Status{};
 }
 
 void CLDeconvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
-                                     unsigned int inner_border_right, unsigned int inner_border_top)
+                                     unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
@@ -94,16 +93,18 @@
     const unsigned int stride_y = info.stride().second;
 
     auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
-                                                    info.pad().first, info.pad().second, inner_border_top, inner_border_right, stride_x, stride_y);
+                                                    info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
 
     const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
 
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
 
+    _is_prepared = false;
+
     _memory_group.manage(&_scaled_output);
 
     // configure scale function
@@ -113,21 +114,34 @@
     const unsigned int out_y = input->info()->dimension(1) + (input->info()->dimension(1) - 1) * (stride_y - 1) + inner_border_top + 2 * info.pad().second;
     scale_out_shape.set(0, out_x);
     scale_out_shape.set(1, out_y);
-    TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type());
     _scaled_output.allocator()->init(scale_out_info);
 
     _scale_f.configure(input, &_scaled_output, BorderSize(inner_border_top, inner_border_right), info);
 
     // setup the function to convolve the upscaled output
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-    _conv_f.configure(&_scaled_output, weights, bias, output, conv_info);
+    _conv_f.configure(&_scaled_output, weights, bias, output, conv_info, weights_info);
     _scaled_output.allocator()->allocate();
 }
 
 void CLDeconvolutionLayer::run()
 {
+    prepare();
+
     _memory_group.acquire();
+
     _scale_f.run();
     _conv_f.run();
+
     _memory_group.release();
 }
+
+void CLDeconvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        _conv_f.prepare();
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
index 26d46a4..b5e8fd9 100644
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
@@ -27,7 +27,9 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
@@ -41,22 +43,26 @@
 {
 }
 
-void CLDepthConcatenateLayer::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
+void CLDepthConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output) // NOLINT
 {
-    ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
-
     _num_inputs = inputs_vector.size();
 
-    unsigned int depth_offset = 0;
+    std::vector<ITensorInfo *> inputs_vector_info;
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+    }
 
     _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<CLDepthConcatenateLayerKernel[]>(_num_inputs);
     _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
 
-    TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector_info);
 
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+    ARM_COMPUTE_ERROR_THROW_ON(CLDepthConcatenateLayer::validate(inputs_vector_info, output->info()));
 
+    unsigned int depth_offset = 0;
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
         _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
@@ -69,6 +75,27 @@
     output->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
 }
 
+Status CLDepthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
+
+    // Output auto inizialitation if not yet initialized
+    TensorInfo  tmp_output_info = *output->clone();
+    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector);
+    auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
+
+    unsigned int depth_offset = 0;
+    for(const auto &input : inputs_vector)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayerKernel::validate(input, depth_offset, &tmp_output_info));
+        depth_offset += input->dimension(2);
+    }
+
+    return Status{};
+}
+
 void CLDepthConcatenateLayer::run()
 {
     cl::CommandQueue q = CLScheduler::get().queue();
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index b448465..2e52e8a 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,3 +36,8 @@
     k->configure(input, output, policy, shift);
     _kernel = std::move(k);
 }
+
+Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+{
+    return CLDepthConvertLayerKernel::validate(input, output, policy, shift);
+}
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 676a121..76451af 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -73,7 +73,7 @@
                                                 ActivationLayerInfo act_info, GPUTarget gpu_target)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
 
     if(input->data_layout() == DataLayout::NCHW)
     {
@@ -91,7 +91,7 @@
 
 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
-      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
+      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
@@ -99,12 +99,17 @@
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
 
-    const size_t weights_w = weights->info()->dimension(0);
-    const size_t weights_h = weights->info()->dimension(1);
-    const size_t weights_z = weights->info()->dimension(2);
+    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
 
-    _is_first_run     = true;
+    const size_t weights_w = weights->info()->dimension(idx_w);
+    const size_t weights_h = weights->info()->dimension(idx_h);
+    const size_t weights_z = weights->info()->dimension(idx_c);
+
+    _is_prepared      = false;
     _original_weights = weights;
     _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
@@ -119,8 +124,8 @@
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
 
     // Output width and height
-    const unsigned int conv_w = output_shape.x();
-    const unsigned int conv_h = output_shape.y();
+    const unsigned int conv_w = output_shape[idx_w];
+    const unsigned int conv_h = output_shape[idx_h];
 
     // Set up intermediate tensors
     const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
@@ -134,6 +139,7 @@
     _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
     _im2col_kernel.set_target(gpu_target);
     _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+    CLScheduler::get().tune_kernel_static(_im2col_kernel);
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -149,6 +155,7 @@
     _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
     _v2mm_kernel.set_target(gpu_target);
     _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+    CLScheduler::get().tune_kernel_static(_v2mm_kernel);
     _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
     _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
 
@@ -180,24 +187,27 @@
 
     // Allocate intermediate tensors
     _input_reshaped.allocator()->allocate();
-    _weights_reshaped.allocator()->allocate();
     _v2mm_output.allocator()->allocate();
 }
 
 Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                              unsigned int depth_multiplier)
 {
+    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != weights->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
 
     const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
     const bool         append_bias  = (biases != nullptr) && !is_quantized;
     const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
-    const size_t       weights_w    = weights->dimension(0);
-    const size_t       weights_h    = weights->dimension(1);
-    const size_t       weights_z    = weights->dimension(2);
-    const unsigned int conv_w       = output_shape.x();
-    const unsigned int conv_h       = output_shape.y();
+    const size_t       weights_w    = weights->dimension(idx_w);
+    const size_t       weights_h    = weights->dimension(idx_h);
+    const size_t       weights_z    = weights->dimension(idx_c);
+    const unsigned int conv_w       = output_shape[idx_w];
+    const unsigned int conv_h       = output_shape[idx_h];
     const size_t       patch_size   = weights_w * weights_h + ((append_bias) ? 1 : 0);
     const size_t       conv_size    = conv_w * conv_h;
 
@@ -233,18 +243,7 @@
 
 void CLDepthwiseConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        CLScheduler::get().enqueue(_weights_reshape_kernel);
-        CLScheduler::get().enqueue(_v2mm_weights_fill_border);
-        _is_first_run = false;
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     CLScheduler::get().enqueue(_im2col_kernel);
     CLScheduler::get().enqueue(_v2mm_input_fill_border);
@@ -255,3 +254,20 @@
         CLScheduler::get().enqueue(_output_stage_kernel);
     }
 }
+
+void CLDepthwiseConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights reshaping and mark original weights tensor as unused
+        _weights_reshaped.allocator()->allocate();
+        CLScheduler::get().enqueue(_weights_reshape_kernel);
+        CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+        _original_weights->mark_as_unused();
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
index af2c6f0..fa2c3af 100644
--- a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,6 +45,14 @@
 
 void CLDepthwiseSeparableConvolutionLayer::run()
 {
+    prepare();
+
     _depthwise_conv.run();
     _pointwise_conv.run();
+}
+
+void CLDepthwiseSeparableConvolutionLayer::prepare()
+{
+    _depthwise_conv.prepare();
+    _pointwise_conv.prepare();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
index 9f571b2..b372c35 100644
--- a/src/runtime/CL/functions/CLFlattenLayer.cpp
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,15 +23,21 @@
  */
 #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
 
-#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
-#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
 void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLIm2ColKernel>();
-    k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+    auto k = arm_compute::support::cpp14::make_unique<CLFlattenLayerKernel>();
+    k->configure(input, output);
     _kernel = std::move(k);
+    CLScheduler::get().tune_kernel_static(*_kernel);
 }
+
+Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLFlattenLayerKernel::validate(input, output);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 151fa1b..010985d 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -73,12 +73,12 @@
 }
 
 CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(),
-      _im2col_output(), _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false), _original_weights(nullptr)
+    : _memory_group(memory_manager), _convert_weights(), _flatten_layer(), _reshape_weights_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
+      _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true),
+      _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
 {
 }
-
-void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights)
 {
     if(_is_quantized)
     {
@@ -100,40 +100,41 @@
     else
     {
         // Configure matrix multiply kernel
-        _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
+        _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */, 1, false, retain_internal_weights));
     }
 }
 
-void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights)
 {
     ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
 
     // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
 
-    // Initialize output tensor for im2col
-    TensorShape shape_im2col = compute_im2col_fc_shape(input->info());
-    _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+    // Initialize output tensor for flatten
+    TensorShape shape_flatten = compute_flatten_shape(input->info());
+    _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten).set_data_layout(DataLayout::NCHW));
 
-    // Configure im2col kernel
-    _memory_group.manage(&_im2col_output);
-    _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+    // Configure flatten kernel
+    _memory_group.manage(&_flatten_output);
+    _flatten_layer.configure(input, &_flatten_output);
 
     // Configure matrix multiply kernel
-    configure_mm(&_im2col_output, weights, output);
+    configure_mm(&_flatten_output, weights, output, retain_internal_weights);
 
-    // Allocate the output tensor for im2col once all the configure methods have been called
-    _im2col_output.allocator()->allocate();
+    // Allocate the output tensor for flatten once all the configure methods have been called
+    _flatten_output.allocator()->allocate();
 }
 
-void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights)
 {
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
 
     // Configure matrix multiply kernel
-    configure_mm(input, weights, output);
+    configure_mm(input, weights, output, retain_internal_weights);
 }
 
-void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
+void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+                                      FullyConnectedLayerInfo fc_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
@@ -142,14 +143,15 @@
                                                                weights->info(),
                                                                biases != nullptr ? biases->info() : nullptr,
                                                                output->info(),
-                                                               transpose_weights,
-                                                               are_weights_reshaped));
+                                                               fc_info));
 
-    _are_weights_reshaped = transpose_weights ? are_weights_reshaped : true;
-    _is_fc_after_conv     = true;
-    _accumulate_biases    = false;
-    _is_quantized         = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _original_weights     = weights;
+    _are_weights_converted = true;
+    _are_weights_reshaped  = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+    _is_fc_after_conv      = true;
+    _accumulate_biases     = false;
+    _is_quantized          = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_prepared           = fc_info.retain_internal_weights;
+    _original_weights      = weights;
 
     // Configure gemmlowp output
     if(_is_quantized)
@@ -169,25 +171,16 @@
         _accumulate_biases_kernel.configure(output, biases);
     }
 
+    const ICLTensor *weights_to_use = weights;
+
     // With the Fully Connected layer we can have 4 different cases:
     //  1) Convolution layer -> Fully Connected layer without batches
     //  2) Fully Connected layer -> Fully Connected layer without batches
     //  3) Convolution layer -> Fully Connected layer with batches
     //  4) Fully Connected layer -> Fully Connected layer with batches
 
-    const ICLTensor *weights_to_use = weights;
-
-    if(!_are_weights_reshaped)
-    {
-        weights_to_use = &_reshape_weights_output;
-
-        // Reshape the weights
-        _reshape_weights_kernel.configure(weights, &_reshape_weights_output);
-    }
-
     // Check if we have a fully connected layer with batches
     const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
-
     if(is_batched_fc_layer)
     {
         _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
@@ -199,16 +192,38 @@
         _is_fc_after_conv = input->info()->num_dimensions() > 1;
     }
 
+    // Reshape weights if needed
+    if(!_are_weights_reshaped)
+    {
+        // Reshape the weights
+        _reshape_weights_kernel.configure(weights, &_reshape_weights_output);
+        weights_to_use = &_reshape_weights_output;
+    }
+
+    // Convert weights if needed
+    if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
+    {
+        // Convert weights
+        _convert_weights.configure(weights_to_use,
+                                   &_converted_weights_output,
+                                   input->info()->tensor_shape(),
+                                   fc_info.weights_trained_layout);
+
+        weights_to_use         = &_converted_weights_output;
+        _are_weights_converted = false;
+    }
+
+    // Configure fc core
     ICLTensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
     if(_is_fc_after_conv)
     {
         // Fully Connected layer after a Convolution Layer without batches
-        configure_conv_fc(input, weights_to_use, tmp_output);
+        configure_conv_fc(input, weights_to_use, tmp_output, fc_info.retain_internal_weights);
     }
     else
     {
         // Fully Connected layer after a Fully Connected Layer without batches
-        configure_fc_fc(input, weights_to_use, tmp_output);
+        configure_fc_fc(input, weights_to_use, tmp_output, fc_info.retain_internal_weights);
     }
 
     // Configure output stage for asymmetric quantized types
@@ -222,21 +237,23 @@
     }
 }
 
-Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
+Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+                                       FullyConnectedLayerInfo fc_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
 
-    bool            weights_reshaped = transpose_weights ? are_weights_reshaped : true;
+    bool            weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
     bool            is_fc_after_conv = true;
     bool            is_quantized     = is_data_type_quantized_asymmetric(input->data_type());
     const GPUTarget gpu_target       = CLScheduler::get().target();
 
-    const ITensorInfo &im2col_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input)));
-    const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
-    const ITensorInfo &gemmlowp_output  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+    const ITensorInfo &flatten_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)).set_data_layout(DataLayout::NCHW));
+    const ITensorInfo &reshaped_weights  = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+    const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
+    const ITensorInfo &gemmlowp_output   = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
 
     // Configure accumulate biases kernel for non quantized asymmetric types
     if(biases != nullptr && !is_quantized)
@@ -255,16 +272,8 @@
     const ITensorInfo *weights_to_use = weights;
     const ITensorInfo *tmp_output     = (is_quantized) ? &gemmlowp_output : output;
 
-    if(!weights_reshaped)
-    {
-        // Validate reshape weights kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
-        weights_to_use = &reshaped_weights;
-    }
-
     // Check if we have a fully connected layer with batches
     const bool is_batched_fc_layer = output->dimension(1) > 1;
-
     if(is_batched_fc_layer)
     {
         is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3,
@@ -276,14 +285,31 @@
         is_fc_after_conv = input->num_dimensions() > 1;
     }
 
+    if(!weights_reshaped)
+    {
+        // Validate reshape weights kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+        weights_to_use = &reshaped_weights;
+    }
+
+    if(is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+    {
+        // Validate convert weights kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate(weights_to_use,
+                                                                             &converted_weights,
+                                                                             input->tensor_shape(),
+                                                                             fc_info.weights_trained_layout));
+        weights_to_use = &converted_weights;
+    }
+
     if(is_fc_after_conv)
     {
         // Fully Connected layer after a Convolution Layer without batches
         ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
 
-        // Validate im2col kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false));
-        input_to_use = &im2col_input;
+        // Validate flatten kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input));
+        input_to_use = &flatten_input;
     }
     else
     {
@@ -311,7 +337,7 @@
     // Linearize input if it comes from a convolutional layer
     if(_is_fc_after_conv)
     {
-        CLScheduler::get().enqueue(_im2col_kernel, false);
+        _flatten_layer.run();
     }
 
     // Run matrix multiply
@@ -342,27 +368,57 @@
 
 void CLFullyConnectedLayer::prepare()
 {
-    // Reshape of the weights (happens only once)
-    if(!_are_weights_reshaped)
+    if(!_is_prepared)
     {
         ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
-        // Run reshape weights kernel and mark weights as unused
-        _reshape_weights_output.allocator()->allocate();
-        _reshape_weights_kernel.run();
-        _original_weights->mark_as_unused();
+        auto release_unused = [](CLTensor * w)
+        {
+            if(!w->is_used())
+            {
+                CLScheduler::get().queue().finish();
+                w->allocator()->free();
+            }
+        };
+
+        // Pointer to current weights
+        const ICLTensor *cur_weights = _original_weights;
+
+        // Reshape of the weights if needed (happens only once)
+        if(!_are_weights_reshaped)
+        {
+            // Run reshape weights kernel and mark weights as unused
+            _reshape_weights_output.allocator()->allocate();
+            _reshape_weights_kernel.run();
+
+            cur_weights->mark_as_unused();
+            cur_weights           = &_reshape_weights_output;
+            _are_weights_reshaped = true;
+        }
+
+        // Convert weights if needed (happens only once)
+        if(!_are_weights_converted)
+        {
+            _converted_weights_output.allocator()->allocate();
+            _convert_weights.run();
+
+            cur_weights->mark_as_unused();
+            _are_weights_converted = true;
+        }
+
+        // Release reshaped weights if unused
+        release_unused(&_reshape_weights_output);
 
         // Prepare GEMM prepare and release unused weights
         if(!_is_quantized)
         {
             _mm_gemm.prepare();
-            if(!_reshape_weights_output.is_used())
-            {
-                _reshape_weights_output.allocator()->free();
-            }
         }
 
-        CLScheduler::get().queue().finish();
-        _are_weights_reshaped = true;
+        // Release converted weights if unused
+        release_unused(&_reshape_weights_output);
+        release_unused(&_converted_weights_output);
+
+        _is_prepared = true;
     }
 }
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index f81da6c..f16d1c0 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -24,10 +24,6 @@
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/Helpers.h"
@@ -48,13 +44,16 @@
 {
     bool flag = true;
 
-    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
     {
-        // COMPMID-852
         if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
         {
-            const float scale = k < 1024 ? 2.0f : 2.5f;
-            flag              = (scale * n) > ((1.66f * n) + 38.4f);
+            constexpr float alpha = 3.2f;
+            constexpr float fact0 = 1.51f;
+            constexpr float fact1 = 1.66f;
+            constexpr float ops   = 12.0f;
+            const float     scale = k > 1024 ? 1.07f : 1.0f;
+            flag                  = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
         }
         else
         {
@@ -84,12 +83,10 @@
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
 
-    // Store original b matrix
-    _original_b = b;
-
     // Check if we need to reshape the matrix B only on the first run
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
-    _is_prepared                 = false;
+    _is_prepared                 = gemm_info.retain_internal_weights();
+    _original_b                  = b;
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
@@ -104,9 +101,11 @@
     // Arguments used by GEMMReshapeInfo
     // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
     // in order to know how the matrices have been reshaped
-    const int m                         = a->info()->dimension(1);
+    bool      reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
+    const int m                         = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
     const int n                         = b->info()->dimension(0);
     const int k                         = a->info()->dimension(0);
+    const int depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
     int       mult_transpose1xW_width   = 1;
     int       mult_interleave4x4_height = 1;
 
@@ -119,6 +118,12 @@
     // Check if we need to reshape the matrix A and matrix B
     _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
 
+    // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
+    if(_is_interleaved_transposed)
+    {
+        reinterpret_input_as_3d = false;
+    }
+
     if(_is_interleaved_transposed)
     {
         matrix_a = &_tmp_a;
@@ -133,13 +138,16 @@
         // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
 
         // Configure interleave kernel
-        _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height);
+        _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d());
 
         // Configure transpose kernel
         _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
     }
 
-    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
+    // Configure and tune matrix multiply kernel
+    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d,
+                                                                                                        reinterpret_input_as_3d));
+    CLScheduler::get().tune_kernel_static(_mm_kernel);
 
     if(_is_interleaved_transposed)
     {
@@ -162,6 +170,7 @@
 Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(output);
 
     // Check if we need to reshape the matrix B only on the first run
     const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
@@ -171,7 +180,6 @@
 
     TensorInfo tmp_a_info{};
     TensorInfo tmp_b_info{};
-    TensorInfo tmp_output_info = *output->clone();
 
     // Get the GPU target
     const GPUTarget gpu_target = CLScheduler::get().target();
@@ -179,11 +187,13 @@
     // Arguments used by GEMMReshapeInfo
     // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
     // in order to know how the matrices have been reshaped
-    const int m                         = a->dimension(1);
+    bool      reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
+    const int m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
     const int n                         = b->dimension(0);
     const int k                         = a->dimension(0);
     int       mult_transpose1xW_width   = 1;
     int       mult_interleave4x4_height = 1;
+    const int depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
 
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
@@ -191,19 +201,25 @@
         mult_interleave4x4_height = 2;
     }
 
-    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height);
-
     // Check if we need to reshape the matrix A and matrix B
     const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
 
+    // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
+    if(run_interleave_transpose)
+    {
+        reinterpret_input_as_3d = false;
+    }
+
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d);
+
     if(run_interleave_transpose)
     {
         matrix_a_info = &tmp_a_info;
         matrix_b_info = &tmp_b_info;
 
         // Validate interleave kernel
-        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height));
+        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
 
         // Validate transpose kernel
         auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
@@ -211,13 +227,12 @@
     }
 
     // Validate matrix multiply
-    auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info, gpu_target));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target));
 
     if(beta != 0 && c != nullptr)
     {
         // Validate matrix addition kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, &tmp_output_info, beta));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
     }
 
     return Status{};
@@ -259,7 +274,7 @@
     {
         if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
         {
-            // Run transpose kernel
+            // Run transpose kernel and mark original weights tensor as unused
             _tmp_b.allocator()->allocate();
             CLScheduler::get().enqueue(_transpose_kernel, false);
             _original_b->mark_as_unused();
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 79495e4..92d04d6 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -43,42 +43,43 @@
 {
 }
 
-void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output)
+void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayerReshapeWeights::validate(weights->info(),
                                                                           (biases != nullptr) ? biases->info() : nullptr,
-                                                                          output->info()));
+                                                                          output->info(),
+                                                                          num_groups));
 
     const bool       append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
     const ICLTensor *biases_to_use = (append_biases) ? biases : nullptr;
 
-    _weights_reshape_kernel.configure(weights, biases_to_use, output);
+    _weights_reshape_kernel.configure(weights, biases_to_use, output, num_groups);
 
     output->info()->set_quantization_info(weights->info()->quantization_info());
 }
 
-Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output)
+Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
 
     if(biases != nullptr)
     {
+        const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
         ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type()));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     }
 
     if((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
 
-        CLWeightsReshapeKernel::validate(weights, biases, output);
+        CLWeightsReshapeKernel::validate(weights, biases, output, num_groups);
     }
 
     return Status{};
@@ -91,14 +92,15 @@
 
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
-      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
+      _add_bias_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false),
+      _skip_im2col(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
-void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, int gemm_3d_depth)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info(), gemm_3d_depth, _skip_im2col));
 
     if(_is_quantized)
     {
@@ -119,15 +121,16 @@
     else
     {
         // Configure matrix multiply function
-        _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+        _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, gemm_3d_depth,
+                                                                                 _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */));
     }
 }
 
-Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output)
+Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int gemm_3d_depth, bool skip_im2col)
 {
     const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
 
-    const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */);
+    const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
     if(is_quantized)
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -141,18 +144,17 @@
         weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
 
         // Perform validation step on GEMMLowp
-        CLGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), output, gemm_info);
+        return CLGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), output, gemm_info);
     }
     else
     {
         // Perform validation step on Matrix multiply function
-        CLGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+        return CLGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
     }
-    return Status{};
 }
 
 void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                       const Size2D &dilation, const ActivationLayerInfo &act_info)
+                                       const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
@@ -163,22 +165,35 @@
                                                                 conv_info,
                                                                 weights_info,
                                                                 dilation,
-                                                                act_info));
+                                                                act_info,
+                                                                num_groups));
 
-    _is_prepared      = false;
+    const DataType   data_type   = input->info()->data_type();
+    const DataLayout data_layout = input->info()->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    const unsigned int kernel_width  = weights->info()->dimension(idx_width);
+    const unsigned int kernel_height = weights->info()->dimension(idx_height);
+
+    _is_prepared      = weights_info.retain_internal_weights();
     _original_weights = weights;
     _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
-
-    const DataType dt = input->info()->data_type();
+    _data_layout      = data_layout;
+    _skip_im2col      = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1) && !_is_quantized;
+    _append_bias      = (biases != nullptr) && (!_is_quantized);
 
     // Set the GPU target for im2col and col2im
     _im2col_kernel.set_target(CLScheduler::get().target());
     _col2im_kernel.set_target(CLScheduler::get().target());
 
-    const bool append_bias = (biases != nullptr) && (!_is_quantized);
+    bool             is_nhwc                   = _data_layout == DataLayout::NHWC;
+    const ICLTensor *gemm_input_to_use         = input;
+    ICLTensor       *gemm_output_to_use        = output;
+    ICLTensor       *gemm_output_staged_to_use = output;
 
-    const unsigned   bias_element  = (append_bias) ? 1 : 0;
-    const ICLTensor *biases_to_use = (append_bias) ? biases : nullptr;
+    const ICLTensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr;
 
     // Get parameters from conv_info
     unsigned int stride_x = 0;
@@ -188,51 +203,66 @@
     // Get convolved dimensions
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(idx_width),
+                                                 input->info()->dimension(idx_height),
+                                                 kernel_width,
+                                                 kernel_height,
+                                                 conv_info,
+                                                 dilation);
 
-    const unsigned int kernel_width  = weights->info()->dimension(0);
-    const unsigned int kernel_height = weights->info()->dimension(1);
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
-                                                 conv_info, dilation);
-
-    unsigned int mat_weights_cols = weights->info()->dimension(3);
-    unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
+    unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels) / num_groups;
 
     // _weights_reshaped will be auto configured in the kernel.
     // Just append biases and do not transpose 1xW as it will be reshaped in CLGEMM
-    _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped);
-
-    weights = &_weights_reshaped;
+    _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, num_groups);
 
     // Create tensor to store im2col reshaped inputs
-    const unsigned int mat_input_cols = mat_weights_rows;
-    const unsigned int mat_input_rows = conv_w * conv_h;
-    TensorShape        shape_im2col   = input->info()->tensor_shape();
-    shape_im2col.set(0, mat_input_cols);
-    shape_im2col.set(1, mat_input_rows);
-    shape_im2col.set(2, 1);
-    TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->info()->fixed_point_position());
-    im2col_reshaped_info.set_quantization_info(input->info()->quantization_info());
-    _im2col_output.allocator()->init(im2col_reshaped_info);
-    _memory_group.manage(&_im2col_output);
+    if(!_skip_im2col)
+    {
+        _memory_group.manage(&_im2col_output);
+
+        // Configure and tune im2col. im2col output shape is auto-initialized
+        _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation, num_groups);
+
+        // Set quantization info
+        _im2col_output.info()->set_quantization_info(input->info()->quantization_info());
+        CLScheduler::get().tune_kernel_static(_im2col_kernel);
+
+        // Update GEMM input
+        gemm_input_to_use = &_im2col_output;
+    }
+    else if(_append_bias)
+    {
+        // Configure add bias kernel
+        _add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
+    }
 
     // Create GEMM output tensor
-    TensorShape shape_gemm = _im2col_output.info()->tensor_shape();
-    shape_gemm.set(0, mat_weights_cols);
-    shape_gemm.set(1, mat_input_rows);
-    const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
-    // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
-    TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
-    info_gemm.set_quantization_info(output->info()->quantization_info());
-    _gemm_output.allocator()->init(info_gemm);
-    _memory_group.manage(&_gemm_output);
+    if(!is_nhwc || _is_quantized)
+    {
+        // Calculate GEMM output shape
+        TensorShape shape_gemm = _im2col_output.info()->tensor_shape();
+        shape_gemm.set(0, mat_weights_cols);
+        shape_gemm.set(1, conv_w * conv_h);
 
-    // Configure im2col
-    _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
+        // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
+        const DataType gemm_data_type = _is_quantized ? DataType::S32 : data_type;
+        TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
+        info_gemm.set_quantization_info(output->info()->quantization_info());
+        _gemm_output.allocator()->init(info_gemm);
+        _memory_group.manage(&_gemm_output);
 
-    // Configure GEMM
-    configure_mm(&_im2col_output, weights, &_gemm_output);
+        // Update GEMM output
+        gemm_output_to_use = &_gemm_output;
+    }
 
-    _im2col_output.allocator()->allocate();
+    // Configure and tune GEMM
+    configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, (data_layout == DataLayout::NHWC) ? conv_h : 1);
+
+    if(!_skip_im2col)
+    {
+        _im2col_output.allocator()->allocate();
+    }
 
     // Configure output stage for quantized case
     if(_is_quantized)
@@ -242,19 +272,36 @@
         float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
         int   output_multiplier, output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
         _memory_group.manage(&_tmp_output);
-        _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output_quant_info.offset);
+        gemm_output_staged_to_use = &_tmp_output;
+
+        _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset);
     }
 
-    // Configure Col2Im
-    _col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, std::make_pair(conv_w, conv_h));
-    if(_is_quantized)
+    if(!is_nhwc || _is_quantized)
+    {
+        if(input->info()->data_layout() == DataLayout::NCHW)
+        {
+            // Configure and tune Col2Im
+            _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, std::make_pair(conv_w, conv_h), num_groups);
+            CLScheduler::get().tune_kernel_static(_col2im_kernel);
+        }
+        else
+        {
+            // Configure reshape layer
+            _reshape_layer.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output);
+        }
+    }
+
+    if(!is_nhwc || _is_quantized)
     {
         _tmp_output.allocator()->allocate();
+        _gemm_output.allocator()->allocate();
     }
-    _gemm_output.allocator()->allocate();
 
-    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
+                             "Output shape does not match the expected one");
 
     //Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
@@ -268,76 +315,42 @@
 }
 
 Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                        const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
+                                        const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8");
+    ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(2) / weights->dimension(2)) != num_groups) && (input->data_layout() == DataLayout::NCHW));
+
+    const DataLayout data_layout = input->data_layout();
+    const DataType   data_type   = input->data_type();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const int        idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    const unsigned int kernel_width  = weights->dimension(idx_width);
+    const unsigned int kernel_height = weights->dimension(idx_height);
+
+    TensorInfo         im2col_reshaped_info, info_gemm, tmp_info, weights_reshaped_info;
+    const ITensorInfo *gemm_input_to_use         = input;
+    const ITensorInfo *gemm_output_to_use        = output;
+    const ITensorInfo *gemm_output_staged_to_use = output;
+    const ITensorInfo *weights_to_use            = weights;
+
+    const bool is_nhwc      = data_layout == DataLayout::NHWC;
+    const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+    const bool skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1) && !is_quantized;
+    const bool append_bias  = (biases != nullptr) && (!is_quantized);
+
+    ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
 
-    if(act_info.enabled())
-    {
-        ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a());
-    }
-
-    const bool     is_quantized = is_data_type_quantized_asymmetric(input->data_type());
-    const bool     append_bias  = (biases != nullptr) && (!is_quantized);
-    const unsigned bias_element = (append_bias) ? 1 : 0;
-    const DataType dt           = input->data_type();
-
-    // Get convolved dimensions
-    unsigned int conv_w = 0;
-    unsigned int conv_h = 0;
-
-    const unsigned int kernel_width  = weights->dimension(0);
-    const unsigned int kernel_height = weights->dimension(1);
-
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, conv_info, dilation);
-
-    unsigned int mat_weights_cols = weights->dimension(3);
-    unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + bias_element;
-
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr));
-
-    // Create tensor info for im2col reshaped inputs
-    const unsigned int mat_input_cols = mat_weights_rows;
-    const unsigned int mat_input_rows = conv_w * conv_h;
-    TensorShape        shape_im2col   = input->tensor_shape();
-    shape_im2col.set(0, mat_input_cols);
-    shape_im2col.set(1, mat_input_rows);
-    shape_im2col.set(2, 1);
-    TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->fixed_point_position());
-    im2col_reshaped_info.set_quantization_info(input->quantization_info());
-    ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
-
-    // Create GEMM output tensor
-    TensorShape shape_gemm = im2col_reshaped_info.tensor_shape();
-    shape_gemm.set(0, mat_weights_cols);
-    shape_gemm.set(1, mat_input_rows);
-    const DataType gemm_data_type = is_quantized ? DataType::S32 : dt;
-    // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
-    TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->fixed_point_position());
-    info_gemm.set_quantization_info(output->quantization_info());
-
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(&im2col_reshaped_info, weights, &info_gemm));
-    TensorInfo tmp_info(shape_gemm, 1, DataType::QASYMM8, input->fixed_point_position());
-    tmp_info.set_quantization_info(output->quantization_info());
-
-    if(is_quantized)
-    {
-        float multiplier = input->quantization_info().scale * weights->quantization_info().scale / output->quantization_info().scale;
-        int   output_multiplier, output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        // Validate output stage for quantized case
-        CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&info_gemm, biases, &tmp_info, output->quantization_info().offset);
-    }
-
-    // Validate Col2Im
-    ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(is_quantized ? &tmp_info : &info_gemm, output, std::make_pair(conv_w, conv_h)));
-
+    // Validate biases
     if(biases != nullptr)
     {
         if(is_quantized)
@@ -348,11 +361,91 @@
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
         }
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     }
 
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a());
+    }
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width),
+                                                 input->dimension(idx_height),
+                                                 kernel_width,
+                                                 kernel_height,
+                                                 conv_info,
+                                                 dilation);
+
+    unsigned int mat_weights_cols = weights->dimension(idx_kernels) / num_groups;
+
+    // Output tensor auto inizialitation if not yet initialized
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr, num_groups));
+    weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, (append_bias && !skip_im2col), num_groups), 1, data_type);
+    weights_to_use        = &weights_reshaped_info;
+
+    if(!skip_im2col)
+    {
+        const Size2D kernel_dims(kernel_width, kernel_height);
+
+        // Output tensor auto initialization if not yet initialized
+        TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, num_groups == 1, num_groups);
+
+        auto_init_if_empty(im2col_reshaped_info, input->clone()->set_tensor_shape(expected_output_shape));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation, num_groups));
+        gemm_input_to_use = &im2col_reshaped_info;
+    }
+    else if(append_bias)
+    {
+        // Validate add bias kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
+    }
+
+    // Create GEMM output tensor
+    if(!is_nhwc || is_quantized)
+    {
+        TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
+        shape_gemm.set(0, mat_weights_cols);
+        shape_gemm.set(1, conv_w * conv_h);
+        const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type;
+        // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
+        info_gemm = TensorInfo(shape_gemm, 1, gemm_data_type);
+        info_gemm.set_quantization_info(output->quantization_info());
+        gemm_output_to_use = &info_gemm;
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, (data_layout == DataLayout::NHWC) ? conv_h : 1, skip_im2col));
+
+    if(is_quantized)
+    {
+        float multiplier = input->quantization_info().scale * weights_to_use->quantization_info().scale / output->quantization_info().scale;
+        int   output_multiplier, output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+        tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
+        tmp_info.set_quantization_info(output->quantization_info());
+        gemm_output_staged_to_use = &tmp_info;
+
+        // Validate output stage for quantized case
+        CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, output->quantization_info().offset);
+    }
+
+    // Validate Col2Im
+    if(!is_nhwc || is_quantized)
+    {
+        if(input->data_layout() == DataLayout::NCHW)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(is_quantized ? gemm_output_staged_to_use : gemm_output_to_use,
+                                                                 output,
+                                                                 std::make_pair(conv_w, conv_h), num_groups));
+        }
+    }
+
     //Validate Activation Layer
     if(act_info.enabled())
     {
@@ -369,7 +462,10 @@
     _memory_group.acquire();
 
     // Run im2col
-    CLScheduler::get().enqueue(_im2col_kernel);
+    if(!_skip_im2col)
+    {
+        CLScheduler::get().enqueue(_im2col_kernel);
+    }
 
     // Runs CLGEMM or CLGEMMLowpMatrixMultiplyCore functions
     if(_is_quantized)
@@ -386,8 +482,23 @@
         _mm_gemm.run();
     }
 
+    if(_skip_im2col && _append_bias)
+    {
+        CLScheduler::get().enqueue(_add_bias_kernel);
+    }
+
     // Reshape output matrix
-    CLScheduler::get().enqueue(_col2im_kernel, false);
+    if(_data_layout == DataLayout::NCHW || _is_quantized)
+    {
+        if(_data_layout == DataLayout::NCHW)
+        {
+            CLScheduler::get().enqueue(_col2im_kernel, false);
+        }
+        else
+        {
+            _reshape_layer.run();
+        }
+    }
 
     //Run Activation Layer if enabled
     if(_is_activationlayer_enabled)
@@ -402,20 +513,18 @@
 {
     if(!_is_prepared)
     {
-        // Run weights reshaping and mark as unused
         ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights reshaping and mark original weights tensor as unused
         _weights_reshaped.allocator()->allocate();
         _reshape_weights.run();
         _original_weights->mark_as_unused();
 
-        // Run GEMM prepare
-        if(!_is_quantized)
+        // Prepare GEMM
+        _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
+        if(!_weights_reshaped.is_used())
         {
-            _mm_gemm.prepare();
-            if(!_weights_reshaped.is_used())
-            {
-                _weights_reshaped.allocator()->free();
-            }
+            _weights_reshaped.allocator()->free();
         }
 
         CLScheduler::get().queue().finish();
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 711b006..0ce07c3 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -41,9 +41,8 @@
 {
     bool flag = true;
 
-    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
     {
-        // COMPMID-852
         if(k > 256 && m > 4 && reshape_b_only_on_first_run)
         {
             flag = ((0.72f + n * 0.10766f) < (n * 0.1284f));
@@ -59,8 +58,23 @@
 } // namespace
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
-      _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true), _is_first_run(true), _reshape_b_only_on_first_run(false)
+    : _memory_group(std::move(memory_manager)),
+      _mm_kernel(),
+      _mtx_a_reshape_kernel(),
+      _mtx_b_reshape_kernel(),
+      _mtx_a_reduction_kernel(),
+      _mtx_b_reduction_kernel(),
+      _offset_contribution_kernel(),
+      _vector_sum_col(),
+      _vector_sum_row(),
+      _tmp_a(),
+      _tmp_b(),
+      _original_b(nullptr),
+      _a_offset(0),
+      _b_offset(0),
+      _is_interleaved_transposed(true),
+      _reshape_b_only_on_first_run(false),
+      _is_prepared(false)
 {
 }
 
@@ -70,6 +84,8 @@
     ARM_COMPUTE_UNUSED(gemm_info);
     ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
 
+    _is_prepared                 = false;
+    _original_b                  = b;
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
     _a_offset                    = a->info()->quantization_info().offset;
     _b_offset                    = b->info()->quantization_info().offset;
@@ -149,10 +165,13 @@
     if(_is_interleaved_transposed)
     {
         _tmp_a.allocator()->allocate();
-        _tmp_b.allocator()->allocate();
+        if(!_reshape_b_only_on_first_run)
+        {
+            _tmp_b.allocator()->allocate();
+        }
     }
 
-    if(_a_offset != 0)
+    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
     {
         _vector_sum_col.allocator()->allocate();
     }
@@ -185,16 +204,17 @@
     const int             k                         = a->dimension(0);
     constexpr int         mult_transpose1xW_width   = 1;
     constexpr int         mult_interleave4x4_height = 1;
-    const GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height);
+    const int             depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    const GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d);
 
     bool reshape_matrices = is_interleaved_transposed(m, n, k, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
 
     if(reshape_matrices)
     {
-        TensorInfo info_a(compute_interleaved_shape(*a, mult_interleave4x4_height), 1, a->data_type());
+        TensorInfo info_a(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()), 1, a->data_type());
         TensorInfo info_b(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width), 1, b->data_type());
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a, mult_interleave4x4_height));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b, mult_transpose1xW_width));
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output, reshape_matrices, reshape_info));
     }
@@ -234,6 +254,8 @@
 
 void CLGEMMLowpMatrixMultiplyCore::run()
 {
+    prepare();
+
     _memory_group.acquire();
 
     if(_is_interleaved_transposed)
@@ -241,21 +263,17 @@
         // Run reshape matrix A
         CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
 
-        if(_is_first_run || !_reshape_b_only_on_first_run)
+        if(!_reshape_b_only_on_first_run)
         {
             // Run reshape matrix B
             CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
         }
     }
 
-    // Note: if _reshape_b_only_on_first_run = true, the reduction kernel can be executed only once
-    if(_is_first_run || !_reshape_b_only_on_first_run)
+    // Run matrix B reduction kernel only if _a_offset is not equal to 0
+    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
     {
-        // Run matrix B reduction kernel only if _a_offset is not equal to 0
-        if(_a_offset != 0)
-        {
-            CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
-        }
+        CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
     }
 
     // Run matrix multiply
@@ -271,6 +289,30 @@
     CLScheduler::get().enqueue(_offset_contribution_kernel, true);
 
     _memory_group.release();
+}
 
-    _is_first_run = false;
+void CLGEMMLowpMatrixMultiplyCore::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+            // Run reshape kernel and mark original weights tensor as unused
+            _tmp_b.allocator()->allocate();
+            CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+            _original_b->mark_as_unused();
+        }
+
+        // Run matrix B reduction kernel only if _a_offset is not equal to 0
+        if(_a_offset != 0 && _reshape_b_only_on_first_run)
+        {
+            _vector_sum_col.allocator()->allocate();
+            CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
+        }
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index ddce5fb..fd82769 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -166,7 +166,7 @@
             _gauss5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
 
             /* Configure scale image kernel */
-            _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode == BorderMode::UNDEFINED, SamplingPolicy::CENTER);
+            _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, SamplingPolicy::CENTER);
         }
 
         _tmp.allocate();
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index 930d311..3458135 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -38,85 +38,91 @@
 using namespace arm_compute::misc::shape_calculator;
 
 CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _gemm_input_gate1(), _gemm_input_gate2(), _transpose_input_gate1(), _transpose_input_gate2(), _accum_input_gate1(),
-      _accum_input_gate2(), _subtract_input_gate(), _activation_input_gate(), _fully_connected_forget_gate(), _gemm_forget_gate1(), _gemm_forget_gate2(), _transpose_forget_gate1(),
-      _transpose_forget_gate2(), _accum_forget_gate1(), _accum_forget_gate2(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state1(),
-      _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output1(),
-      _gemm_output2(), _transpose_output1(), _transpose_output2(), _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state(),
-      _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(),
-      _input_gate_out3(), _input_gate_out4(), _input_gate_out5(), _input_gate_out6(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(),
-      _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _output6(),
-      _cell_state_activation(), _output_projection1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false),
-      _perform_projection_clipping(false)
+    : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _gemm_input_gate(), _transpose_input_gate(), _accum_input_gate1(), _accum_input_gate2(), _subtract_input_gate(),
+      _pixelwise_mul_input_gate(), _activation_input_gate(), _fully_connected_forget_gate(), _gemm_forget_gate(), _transpose_forget_gate(), _accum_forget_gate1(), _accum_forget_gate2(),
+      _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state(), _accum_cell_state1(), _accum_cell_state2(),
+      _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(),
+      _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(),
+      _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _input_gate_out5(),
+      _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(),
+      _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false),
+      _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false)
 {
 }
 
-void CLLSTMLayer::configure(const ICLTensor *input, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
+void CLLSTMLayer::configure(const ICLTensor *input,
+                            const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
                             const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
                             const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                            ICLTensor *output_state, ICLTensor *cell_state, ICLTensor *scratch_buffer, ICLTensor *output, const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info,
-                            float cell_threshold, float projection_threshold)
+                            const ICLTensor *output_state_in, const ICLTensor *cell_state_in,
+                            ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
+                            const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                 forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input,
+                                 input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                                 recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+                                 forget_gate_bias, cell_bias, output_gate_bias,
+                                 output_state_in, cell_state_in,
+                                 scratch_buffer, output_state_out, cell_state_out, output);
+
+    // Set lstm parameters
     LSTMParams<ITensorInfo> lstm_params_info;
     if(lstm_params.has_peephole_opt())
     {
-        lstm_params_info.set_peephole_params(lstm_params.cell_to_input_weights()->info(), lstm_params.cell_to_forget_weights()->info(), lstm_params.cell_to_output_weights()->info());
+        lstm_params_info.set_peephole_params(lstm_params.cell_to_forget_weights()->info(), lstm_params.cell_to_output_weights()->info());
     }
     if(lstm_params.has_projection())
     {
-        lstm_params_info.set_projection_params(lstm_params.projection_weights()->info(), lstm_params.projection_bias()->info());
+        lstm_params_info.set_projection_params(lstm_params.projection_weights()->info(),
+                                               lstm_params.projection_bias() != nullptr ? lstm_params.projection_bias()->info() : nullptr);
     }
     if(!lstm_params.has_cifg_opt())
     {
+        const ITensorInfo *cell_to_input_weights_info = (lstm_params.has_peephole_opt()) ? lstm_params.cell_to_input_weights()->info() : nullptr;
         lstm_params_info.set_cifg_params(lstm_params.input_to_input_weights()->info(), lstm_params.recurrent_to_input_weights()->info(),
-                                         lstm_params.cell_to_input_weights()->info(), lstm_params.input_gate_bias()->info());
+                                         cell_to_input_weights_info, lstm_params.input_gate_bias()->info());
     }
+
+    // Validate
     ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(),
                                                      input_to_cell_weights->info(), input_to_output_weights->info(),
                                                      recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
                                                      forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
-                                                     output_state->info(), cell_state->info(), scratch_buffer->info(), output->info(), lstm_params_info,
-                                                     activation_info, cell_threshold, projection_threshold));
+                                                     output_state_in->info(), cell_state_in->info(),
+                                                     scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
+                                                     lstm_params_info, activation_info, cell_threshold, projection_threshold));
 
-    const TensorShape cell_state_shape = cell_state->info()->tensor_shape();
+    const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
 
+    // Configure block that calculates the forget gate
+    // forget_gate = Activation(input * input_to_forget_weights + output_state_in * recurrent_to_forget_weights + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
     TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
-    TensorShape forget_gate2_shape = compute_transposed_shape(*forget_gate_bias->info());
-    TensorShape forget_gate3_shape{ 1, output_state->info()->dimension(1) };
     _forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
     _forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _forget_gate_out6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _forget_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
-    // Configure block that calculates the forget gate
-    // forget_gate = Activation(input * input_to_forget_weights + output_state * recurrent_to_forget_weights + cell_state * cell_to_forget_weights + forget_gate_bias)
     _memory_group.manage(&_forget_gate_out1);
-    _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1, true, false);
+    _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1);
     _memory_group.manage(&_forget_gate_out2);
-    _transpose_forget_gate1.configure(recurrent_to_forget_weights, &_forget_gate_out2);
+    _transpose_forget_gate.configure(recurrent_to_forget_weights, &_forget_gate_out2);
     _memory_group.manage(&_forget_gate_out3);
-    _gemm_forget_gate1.configure(output_state, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
+    _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
     _forget_gate_out2.allocator()->allocate();
-    _memory_group.manage(&_forget_gate_out6);
-    _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out6, ConvertPolicy::SATURATE);
-    CLTensor *forget_gate_out = &_forget_gate_out6;
+    _memory_group.manage(&_forget_gate_out5);
+    _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+    CLTensor *forget_gate_out = &_forget_gate_out5;
 
     if(lstm_params.has_peephole_opt())
     {
-        _forget_gate_out4.allocator()->init(TensorInfo(forget_gate2_shape, 1, input->info()->data_type()));
-        _forget_gate_out5.allocator()->init(TensorInfo(forget_gate3_shape, 1, input->info()->data_type()));
+        _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
         _run_peephole_opt = true;
         _memory_group.manage(&_forget_gate_out4);
-        _transpose_forget_gate2.configure(lstm_params.cell_to_forget_weights(), &_forget_gate_out4);
-        _memory_group.manage(&_forget_gate_out5);
-        _gemm_forget_gate2.configure(cell_state, &_forget_gate_out4, nullptr, &_forget_gate_out5, 1.f, 0.f);
+        _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+        _accum_forget_gate2.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
         _forget_gate_out4.allocator()->allocate();
-        _accum_forget_gate2.configure(&_forget_gate_out6, &_forget_gate_out5, &_forget_gate_out3, ConvertPolicy::SATURATE);
         _forget_gate_out5.allocator()->allocate();
-        _forget_gate_out6.allocator()->allocate();
         forget_gate_out = &_forget_gate_out3;
     }
     else
@@ -126,13 +132,10 @@
     _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     forget_gate_out->allocator()->allocate();
 
-    TensorShape input_gate3_shape{ 1, output_state->info()->dimension(1) };
-    _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _input_gate_out5.allocator()->init(TensorInfo(input_gate3_shape, 1, input->info()->data_type()));
-
     // Configure block that calculates the input gate
-    // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + cell_state * cell_to_input_weights + input_gate_bias), without CIFG
+    // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     // input_gate = 1 - forget_gate, with CIFG
+    _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     if(lstm_params.has_cifg_opt())
     {
         _memory_group.manage(&_input_gate_out1);
@@ -143,35 +146,36 @@
     }
     else
     {
-        TensorShape input_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
-        TensorShape input_gate2_shape = compute_transposed_shape(*lstm_params.cell_to_input_weights()->info());
+        TensorShape input_gate_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
 
-        _input_gate_out2.allocator()->init(TensorInfo(input_gate1_shape, 1, input->info()->data_type()));
+        _input_gate_out2.allocator()->init(TensorInfo(input_gate_shape, 1, input->info()->data_type()));
         _input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-        _input_gate_out4.allocator()->init(TensorInfo(input_gate2_shape, 1, input->info()->data_type()));
-        _input_gate_out6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _input_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _input_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
         _memory_group.manage(&_input_gate_out1);
-        _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1, true, false);
+        _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1);
         _memory_group.manage(&_input_gate_out2);
-        _transpose_input_gate1.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
+        _transpose_input_gate.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
         _memory_group.manage(&_input_gate_out3);
-        _gemm_input_gate1.configure(output_state, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
+        _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
         _input_gate_out2.allocator()->allocate();
         _memory_group.manage(&_input_gate_out4);
-        _transpose_input_gate2.configure(lstm_params.cell_to_input_weights(), &_input_gate_out4);
-        _memory_group.manage(&_input_gate_out5);
-        _gemm_input_gate2.configure(cell_state, &_input_gate_out4, nullptr, &_input_gate_out5, 1.f, 0.f);
-        _input_gate_out4.allocator()->allocate();
-        _memory_group.manage(&_input_gate_out6);
-        _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out6, ConvertPolicy::SATURATE);
+        _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+        if(_run_peephole_opt)
+        {
+            _memory_group.manage(&_input_gate_out5);
+            _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+            _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _input_gate_out5.allocator()->allocate();
+        }
         _input_gate_out3.allocator()->allocate();
-        _accum_input_gate2.configure(&_input_gate_out6, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
-        _input_gate_out5.allocator()->allocate();
-        _input_gate_out6.allocator()->allocate();
+        _input_gate_out4.allocator()->allocate();
         _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     }
 
+    // Configure block that calculates the cell state
+    // cell_state = Clip((PixelwiseMul(input_gate, Activation(input * input_to_cell_weights + output_state_in * recurrent_to_cell_weights + cell_bias)) + PixelwiseMul(forget_gate, cell_state)), cell_threshold)
     TensorShape cell_state1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
     _cell_state_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     _cell_state_out2.allocator()->init(TensorInfo(cell_state1_shape, 1, input->info()->data_type()));
@@ -179,14 +183,12 @@
     _cell_state_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
-    // Configure block that calculates the cell state
-    // cell_state = Clip((RixelwiseMul(input_gate, Activation(input * input_to_cell_weights + output_state * recurrent_to_cell_weights + cell_bias)) + PixelwiseMul(forget_gate, cell_state)), cell_threshold)
     _memory_group.manage(&_cell_state_out1);
-    _fully_connected_cell_state.configure(input, input_to_cell_weights, cell_bias, &_cell_state_out1, true, false);
+    _fully_connected_cell_state.configure(input, input_to_cell_weights, cell_bias, &_cell_state_out1);
     _memory_group.manage(&_cell_state_out2);
-    _transpose_cell_state1.configure(recurrent_to_cell_weights, &_cell_state_out2);
+    _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2);
     _memory_group.manage(&_cell_state_out3);
-    _gemm_cell_state1.configure(output_state, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
+    _gemm_cell_state1.configure(output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
     _cell_state_out2.allocator()->allocate();
     _memory_group.manage(&_cell_state_out4);
     _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
@@ -195,12 +197,11 @@
     _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
     _input_gate_out1.allocator()->allocate();
     _cell_state_out4.allocator()->allocate();
-    _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
     _forget_gate_out1.allocator()->allocate();
     _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
     _cell_state_out3.allocator()->allocate();
     _cell_state_out5.allocator()->allocate();
-
     // Perform clipping
     if(cell_threshold != 0.f)
     {
@@ -208,53 +209,45 @@
         _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold));
     }
 
+    // Configure block that calculates the output
+    // output_state_out = Activation(input * input_to_output_weights + output_state_in * recurrent_to_output_weights + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
     TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
-    TensorShape output2_shape = compute_transposed_shape(*cell_bias->info());
-    TensorShape output3_shape{ 1, output_state->info()->dimension(1) };
     _output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
     _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _output6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _output5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
-    // Configure block that calculates the output
-    // output_gate = Activation(input * input_to_output_weights + output_state * recurrent_to_output_weights + cell_state * cell_to_output_weights + output_gate_bias)
     _memory_group.manage(&_output1);
-    _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1, true, false);
+    _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1);
     _memory_group.manage(&_output2);
-    _transpose_output1.configure(recurrent_to_output_weights, &_output2);
+    _transpose_output.configure(recurrent_to_output_weights, &_output2);
     _memory_group.manage(&_output3);
-    _gemm_output1.configure(output_state, &_output2, nullptr, &_output3, 1.f, 0.f);
+    _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
     _output2.allocator()->allocate();
-    _memory_group.manage(&_output6);
-    _accum_output1.configure(&_output1, &_output3, &_output6, ConvertPolicy::SATURATE);
+    _memory_group.manage(&_output5);
+    _accum_output1.configure(&_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
     _output3.allocator()->allocate();
-    CLTensor *output_gate_out = &_output6;
+    CLTensor *output_gate_out = &_output5;
     if(lstm_params.has_peephole_opt())
     {
-        _output4.allocator()->init(TensorInfo(output2_shape, 1, input->info()->data_type()));
-        _output5.allocator()->init(TensorInfo(output3_shape, 1, input->info()->data_type()));
+        _output4.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
 
         _memory_group.manage(&_output4);
-        _transpose_output2.configure(lstm_params.cell_to_output_weights(), &_output4);
-        _memory_group.manage(&_output5);
-        _gemm_output2.configure(&_cell_state_out1, &_output4, nullptr, &_output5, 1.f, 0.f);
-        _accum_output2.configure(&_output6, &_output5, &_output1, ConvertPolicy::SATURATE);
-        _output6.allocator()->allocate();
+        _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+        _accum_output2.configure(&_output5, &_output4, &_output1, ConvertPolicy::SATURATE);
+        _output5.allocator()->allocate();
         output_gate_out = &_output1;
 
         // Allocate intermediate buffers
         _output4.allocator()->allocate();
-        _output5.allocator()->allocate();
     }
     else
     {
         _output1.allocator()->allocate();
     }
-    _activation_output.configure(output_gate_out, output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     output_gate_out->allocator()->allocate();
 
-    _cell_state_activation.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-
     // Configure block that calculates the output state
     /** lstm_res = PixelwiseMul(output, Activation(cell_state))
      *
@@ -264,32 +257,32 @@
      *                     \
      *                      -- lstm_res , otherwise
      */
+    ICLTensor *output_state_out_tmp = lstm_params.has_projection() ? &_output_state1 : output_state_out;
+    _cell_state_activation.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _output_state1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
     _memory_group.manage(&_cell_state_activation);
     _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
-    _pixelwise_mul_output_state.configure(&_cell_state_activation, output, output_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
     _cell_state_activation.allocator()->allocate();
 
     if(lstm_params.has_projection())
     {
         _has_projection_weights = true;
-        _output_projection1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-        _memory_group.manage(&_output_projection1);
-        _fully_connected_output_state.configure(output_state, lstm_params.projection_weights(), lstm_params.projection_bias(), &_output_projection1, true, false);
+        _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+        _output_state1.allocator()->allocate();
         // Perform clipping
         if(projection_threshold != 0.f)
         {
             _perform_projection_clipping = true;
-            _projection_clip.configure(&_output_projection1, output_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+            _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
         }
-
-        // Allocate intermediate buffer
-        _output_projection1.allocator()->allocate();
     }
 
     // Copy cell state and output
-    _copy_cell_state.configure(&_cell_state_out1, cell_state);
+    _copy_cell_state.configure(&_cell_state_out1, cell_state_out);
     _cell_state_out1.allocator()->allocate();
-    _copy_output.configure(output_state, output);
+    _copy_output.configure(output_state_out, output);
 
     // Vector for holding the tensors to store in scratch buffer
     std::vector<ICLTensor *> scratch_inputs;
@@ -303,121 +296,161 @@
     _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
 }
 
-Status CLLSTMLayer::validate(const ITensorInfo *input, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
+Status CLLSTMLayer::validate(const ITensorInfo *input,
+                             const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
                              const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
                              const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                             const ITensorInfo *output_state, const ITensorInfo *cell_state, const ITensorInfo *scratch_buffer, const ITensorInfo *output,
+                             const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
+                             const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
                              const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                        forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
-                                                       recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(input_to_forget_weights->num_dimensions() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(input_to_cell_weights->num_dimensions() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_forget_weights->num_dimensions() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_cell_weights->num_dimensions() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
-    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->num_dimensions() != 1);
-    ARM_COMPUTE_RETURN_ERROR_ON(output_gate_bias->num_dimensions() != 1);
-    ARM_COMPUTE_RETURN_ERROR_ON(output_state->num_dimensions() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(cell_state->num_dimensions() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(scratch_buffer->num_dimensions() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
+                                        input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                                        recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+                                        forget_gate_bias, cell_bias, output_gate_bias,
+                                        output_state_in, cell_state_in,
+                                        scratch_buffer, output_state_out, cell_state_out, output);
 
+    // Check data types
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
+                                                       input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                                                       recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+                                                       forget_gate_bias, cell_bias, output_gate_bias,
+                                                       output_state_in, cell_state_in,
+                                                       scratch_buffer, output_state_out, cell_state_out, output);
+
+    // Check dimensions
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_to_forget_weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_to_cell_weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_forget_weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_cell_weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_gate_bias->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(scratch_buffer->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
+                                && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+
+    const unsigned int num_batches = input->dimension(1);
+    const unsigned int num_cells   = input_to_output_weights->dimension(1);
+
+    // Check peephole optimization
     if(lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights(), lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() != 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_output_weights()->num_dimensions() != 1);
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_output_weights()->num_dimensions() > 1);
     }
 
     TensorShape      units_out_transposed_shape = compute_transposed_shape(*recurrent_to_output_weights);
-    TensorShape      gemmv_shape{ 1, output_state->dimension(1) };
     TensorShape      num_units_transposed_shape = compute_transposed_shape(*forget_gate_bias);
     const TensorInfo units_out_transposed_info  = TensorInfo(units_out_transposed_shape, 1, input->data_type());
-    const TensorInfo gemmv_shape_info           = TensorInfo(gemmv_shape, 1, input->data_type());
     const TensorInfo num_units_transposed_info  = TensorInfo(num_units_transposed_shape, 1, input->data_type());
 
+    TensorInfo input_gate      = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
+    TensorInfo forget_gate     = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
+    TensorInfo output_gate_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
+    TensorInfo cell_state_tmp  = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
+
     // Validate forget gate
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, cell_state, true, false));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state, &units_out_transposed_info, nullptr, cell_state, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     if(lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(cell_state, &num_units_transposed_info, nullptr, &gemmv_shape_info, 1.f, 0.f, GEMMInfo()));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, &gemmv_shape_info, cell_state, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, cell_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate input gate
     if(!lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.cell_to_input_weights(), lstm_params.input_gate_bias());
-        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() != 2);
-        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() != 2);
-        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() != 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() != 1);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), cell_state, true, false));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(cell_state, &num_units_transposed_info, nullptr, &gemmv_shape_info, 1.f, 0.f, GEMMInfo()));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, &gemmv_shape_info, cell_state, ConvertPolicy::SATURATE));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
+                                            lstm_params.recurrent_to_input_weights(),
+                                            lstm_params.input_gate_bias());
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &input_gate, 1.f, 0.f, GEMMInfo()));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+        if(lstm_params.has_peephole_opt())
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+        }
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtractionKernel::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtractionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
 
     // Validate cell state
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, cell_bias, cell_state, true, false));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, activation_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state, cell_state, cell_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, cell_bias, &cell_state_tmp));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&cell_state_tmp, nullptr, activation_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
     if(cell_threshold != 0.f)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold,
+                                                                                                                    cell_threshold)));
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, cell_state, true, false));
+    // Validate output gate tmp
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &output_gate_tmp, 1.f, 0.f, GEMMInfo()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
     if(lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+                                                                              RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate output state
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, cell_state, activation_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state, output, output_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
     if(lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(output_state, lstm_params.projection_weights(), lstm_params.projection_bias(), cell_state, true, false));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
         if(projection_threshold != 0.f)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, output_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold,
-                                                                                                                        projection_threshold)));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(output_state_out, output_state_out,
+                                                                          ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
         }
     }
 
-    std::vector<TensorInfo> inputs_vector_info;
+    // Validate copy kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(&cell_state_tmp, cell_state_out));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(output_state_out, output));
+
+    // Validate scratch concatenation
+    std::vector<ITensorInfo *> inputs_vector_info_raw;
     if(lstm_params.has_cifg_opt())
     {
-        inputs_vector_info.emplace_back(*cell_state);
+        inputs_vector_info_raw.push_back(&input_gate);
     }
-    inputs_vector_info.emplace_back(*cell_state);
-    inputs_vector_info.emplace_back(*cell_state);
-    inputs_vector_info.emplace_back(*cell_state);
-
-    std::vector<ITensorInfo *> inputs_vector_info_raw;
-    for(auto &input : inputs_vector_info)
-    {
-        inputs_vector_info_raw.emplace_back(&input);
-    }
+    inputs_vector_info_raw.push_back(&cell_state_tmp);
+    inputs_vector_info_raw.push_back(&forget_gate);
+    inputs_vector_info_raw.push_back(&output_gate_tmp);
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer));
     return Status{};
@@ -428,14 +461,13 @@
     _memory_group.acquire();
 
     _fully_connected_forget_gate.run();
-    CLScheduler::get().enqueue(_transpose_forget_gate1);
-    _gemm_forget_gate1.run();
+    CLScheduler::get().enqueue(_transpose_forget_gate);
+    _gemm_forget_gate.run();
     CLScheduler::get().enqueue(_accum_forget_gate1);
 
     if(_run_peephole_opt)
     {
-        CLScheduler::get().enqueue(_transpose_forget_gate2);
-        _gemm_forget_gate2.run();
+        CLScheduler::get().enqueue(_pixelwise_mul_forget_gate);
         _accum_forget_gate2.run();
     }
     CLScheduler::get().enqueue(_activation_forget_gate);
@@ -443,24 +475,33 @@
     if(_run_cifg_opt)
     {
         _ones.map(true);
-        std::fill_n(_ones.buffer(), _ones.info()->total_size(), 1);
+        if(_ones.info()->data_type() == DataType::F16)
+        {
+            std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
+        }
+        else
+        {
+            std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
+        }
         _ones.unmap();
         CLScheduler::get().enqueue(_subtract_input_gate);
     }
     else
     {
         _fully_connected_input_gate.run();
-        CLScheduler::get().enqueue(_transpose_input_gate1);
-        _gemm_input_gate1.run();
-        CLScheduler::get().enqueue(_transpose_input_gate2);
-        _gemm_input_gate2.run();
+        CLScheduler::get().enqueue(_transpose_input_gate);
+        _gemm_input_gate.run();
         CLScheduler::get().enqueue(_accum_input_gate1);
-        _accum_input_gate2.run();
+        if(_run_peephole_opt)
+        {
+            CLScheduler::get().enqueue(_pixelwise_mul_input_gate);
+            _accum_input_gate2.run();
+        }
         CLScheduler::get().enqueue(_activation_input_gate);
     }
 
     _fully_connected_cell_state.run();
-    CLScheduler::get().enqueue(_transpose_cell_state1);
+    CLScheduler::get().enqueue(_transpose_cell_state);
     _gemm_cell_state1.run();
     CLScheduler::get().enqueue(_accum_cell_state1);
     CLScheduler::get().enqueue(_activation_cell_state);
@@ -474,21 +515,19 @@
     }
 
     _fully_connected_output.run();
-    CLScheduler::get().enqueue(_transpose_output1);
-    _gemm_output1.run();
+    CLScheduler::get().enqueue(_transpose_output);
+    _gemm_output.run();
     CLScheduler::get().enqueue(_accum_output1);
-    CLScheduler::get().enqueue(_pixelwise_mul_output_state);
 
     if(_run_peephole_opt)
     {
-        CLScheduler::get().enqueue(_transpose_output2);
-        _gemm_output2.run();
+        CLScheduler::get().enqueue(_pixelwise_mul_output_state1);
         _accum_output2.run();
     }
     CLScheduler::get().enqueue(_activation_output);
 
     CLScheduler::get().enqueue(_activation_output_state);
-    CLScheduler::get().enqueue(_pixelwise_mul_output_state);
+    CLScheduler::get().enqueue(_pixelwise_mul_output_state2);
 
     if(_has_projection_weights)
     {
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 986fe00..40bf032 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -48,7 +48,10 @@
     // Get convolved dimensions
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0),
+                                                 input->dimension(1),
+                                                 kernel_width,
+                                                 kernel_height,
                                                  conv_info);
 
     const size_t mat_weights_cols = weights->dimension(3);
@@ -61,9 +64,12 @@
     const size_t mat_input_rows = conv_w * conv_h;
 
     shape_im2col = input->tensor_shape();
+    if(shape_im2col.num_dimensions() >= 3)
+    {
+        shape_im2col.remove_dimension(2);
+    }
     shape_im2col.set(0, mat_input_cols);
     shape_im2col.set(1, mat_input_rows);
-    shape_im2col.set(2, 1);
 
     shape_gemm = shape_im2col;
     shape_gemm.set(0, mat_weights_cols);
@@ -73,7 +79,7 @@
 
 CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
-      _is_first_run(false), _original_weights(nullptr)
+      _is_prepared(false), _original_weights(nullptr)
 {
 }
 
@@ -128,7 +134,7 @@
 
     bool _has_bias    = (biases != nullptr);
     _original_weights = weights;
-    _is_first_run     = true;
+    _is_prepared      = false;
 
     const unsigned int kernel_width  = weights->info()->dimension(0);
     const unsigned int kernel_height = weights->info()->dimension(1);
@@ -160,24 +166,15 @@
     _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
 
     // Allocate intermediate tensors
-    _weights_reshaped.allocator()->allocate();
     _input_im2col_reshaped.allocator()->allocate();
     _gemm_output.allocator()->allocate();
+
+    CLScheduler::get().tune_kernel_static(_input_im2col_kernel);
 }
 
 void CLLocallyConnectedLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _is_first_run = false;
-        CLScheduler::get().enqueue(_weights_reshape_kernel);
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -192,3 +189,19 @@
 
     _memory_group.release();
 }
+
+void CLLocallyConnectedLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights reshaping and mark original weights tensor as unused
+        _weights_reshaped.allocator()->allocate();
+        CLScheduler::get().enqueue(_weights_reshape_kernel);
+        _original_weights->mark_as_unused();
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
index b1284db..e2dfe3a 100644
--- a/src/runtime/CL/functions/CLMagnitude.cpp
+++ b/src/runtime/CL/functions/CLMagnitude.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,10 +30,8 @@
 
 using namespace arm_compute;
 
-void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type, bool use_fp16)
+void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
 {
-    ARM_COMPUTE_UNUSED(use_fp16);
-
     auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
     k->configure(input1, input2, output, nullptr, mag_type);
     _kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
index 838f7e7..157f306 100644
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,35 +21,149 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
+#include "arm_compute/core/TensorInfo.h"
 
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
 
 using namespace arm_compute;
 
-CLMeanStdDev::CLMeanStdDev()
-    : _mean_stddev_kernel(),
+CLMeanStdDev::CLMeanStdDev(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _data_type(),
+      _num_pixels(),
+      _run_stddev(),
+      _reduction_operation_mean(),
+      _reduction_operation_stddev(),
+      _reduction_output_mean(),
+      _reduction_output_stddev(),
+      _mean(nullptr),
+      _stddev(nullptr),
+      _mean_stddev_kernel(),
       _fill_border_kernel(),
       _global_sum(),
       _global_sum_squared()
 {
 }
 
+Status CLMeanStdDev::validate(ITensorInfo *input, float *mean, float *stddev)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input);
+    if(is_data_type_float(input->data_type()))
+    {
+        ARM_COMPUTE_UNUSED(mean);
+        ARM_COMPUTE_UNUSED(stddev);
+
+        TensorShape output_shape      = TensorShape{ 1, input->dimension(1) };
+        TensorInfo  output_shape_info = TensorInfo(output_shape, 1, DataType::U8);
+        return CLReductionOperation::validate(input, &output_shape_info, 0, ReductionOperation::SUM);
+    }
+    else
+    {
+        return CLMeanStdDevKernel::validate(input, mean, nullptr, stddev, nullptr);
+    }
+}
+
 void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev)
 {
-    _global_sum = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
+    // In the case of F16/F32 we call reduction operation for calculating CLMeanStdDev
+    _data_type = input->info()->data_type();
 
-    if(stddev != nullptr)
+    if(is_data_type_float(_data_type))
     {
-        _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
+        _num_pixels = input->info()->dimension(0) * input->info()->dimension(1);
+
+        _memory_group.manage(&_reduction_output_mean);
+        _reduction_operation_mean.configure(input, &_reduction_output_mean, 0, ReductionOperation::SUM);
+        _reduction_output_mean.allocator()->allocate();
+        _mean = mean;
+
+        if(stddev != nullptr)
+        {
+            _memory_group.manage(&_reduction_output_stddev);
+            _reduction_operation_stddev.configure(input, &_reduction_output_stddev, 0, ReductionOperation::SUM_SQUARE);
+            _reduction_output_stddev.allocator()->allocate();
+            _stddev     = stddev;
+            _run_stddev = true;
+        }
+    }
+    else
+    {
+        _global_sum = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
+
+        if(stddev != nullptr)
+        {
+            _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
+        }
+
+        _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
+        _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
+    }
+}
+
+template <typename T>
+void CLMeanStdDev::run_float()
+{
+    _memory_group.acquire();
+
+    // Perform reduction on x-axis
+    _reduction_operation_mean.run();
+    if(_run_stddev)
+    {
+        _reduction_operation_stddev.run();
+        _reduction_output_stddev.map(true);
     }
 
-    _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
-    _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
+    _reduction_output_mean.map(true);
+
+    auto mean = static_cast<T>(0);
+
+    // Calculate final result for mean
+    for(unsigned int i = 0; i < _reduction_output_mean.info()->dimension(1); ++i)
+    {
+        mean += *reinterpret_cast<T *>(_reduction_output_mean.buffer() + _reduction_output_mean.info()->offset_element_in_bytes(Coordinates(0, i)));
+    }
+
+    mean /= _num_pixels;
+    *_mean = mean;
+
+    if(_run_stddev)
+    {
+        auto stddev = static_cast<T>(0);
+        // Calculate final result for stddev
+        for(unsigned int i = 0; i < _reduction_output_stddev.info()->dimension(1); ++i)
+        {
+            stddev += *reinterpret_cast<T *>(_reduction_output_stddev.buffer() + _reduction_output_stddev.info()->offset_element_in_bytes(Coordinates(0, i)));
+        }
+        *_stddev = std::sqrt((stddev / _num_pixels) - (mean * mean));
+
+        _reduction_output_stddev.unmap();
+    }
+    _reduction_output_mean.unmap();
+
+    _memory_group.release();
+}
+
+void CLMeanStdDev::run_int()
+{
+    CLScheduler::get().enqueue(_fill_border_kernel);
+    CLScheduler::get().enqueue(_mean_stddev_kernel);
 }
 
 void CLMeanStdDev::run()
 {
-    CLScheduler::get().enqueue(_fill_border_kernel);
-    CLScheduler::get().enqueue(_mean_stddev_kernel);
+    switch(_data_type)
+    {
+        case DataType::F16:
+            run_float<half>();
+            break;
+        case DataType::F32:
+            run_float<float>();
+            break;
+        case DataType::U8:
+            run_int();
+            break;
+        default:
+            ARM_COMPUTE_ERROR_ON("Not supported");
+    }
 }
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 17875a3..cbe1ce3 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -63,6 +63,9 @@
             ARM_COMPUTE_ERROR("Data layout not supported");
     }
     _border_handler.configure(input, _kernel->border_size(), border_mode, pixel_value);
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(*_kernel);
 }
 
 Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 4843ba6..1809e6e 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -36,7 +36,8 @@
 using namespace arm_compute::misc::shape_calculator;
 
 CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output()
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
+      _is_prepared(false)
 {
 }
 
@@ -57,7 +58,7 @@
 
     auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info, true, false));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
     ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
@@ -74,12 +75,14 @@
     const int   idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
     TensorShape shape      = compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
 
+    _is_prepared = false;
+
     _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
     _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
 
     // Manage intermediate buffers and configure
     _memory_group.manage(&_fully_connected_out);
-    _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out, true, false);
+    _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
 
     _memory_group.manage(&_gemm_output);
     _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
@@ -100,7 +103,10 @@
 
 void CLRNNLayer::run()
 {
+    prepare();
+
     _memory_group.acquire();
+
     _fully_connected_kernel.run();
     _gemm_state_f.run();
     CLScheduler::get().enqueue(_add_kernel);
@@ -108,5 +114,17 @@
 
     // copy hidden out to output
     CLScheduler::get().enqueue(_copy_kernel);
+
     _memory_group.release();
+}
+
+void CLRNNLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        _fully_connected_kernel.prepare();
+        _gemm_state_f.prepare();
+
+        _is_prepared = true;
+    }
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 3a5133d..2a171c3 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -71,7 +71,6 @@
         sums_vector[i].set_data_type(input->data_type());
         sums_vector[i].set_tensor_shape(shape);
         sums_vector[i].set_num_channels(input->num_channels());
-        sums_vector[i].set_fixed_point_position(input->fixed_point_position());
     }
 
     // Validate ReductionOperation only on first kernel
@@ -105,7 +104,7 @@
     for(unsigned int i = 0; i < _num_of_stages - 1; i++)
     {
         shape.set(0, ceil(shape.x() / 128.f));
-        _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+        _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
     }
 
     // Apply ReductionOperation only on first kernel
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index cb68481..4ff9763 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/kernels/CLScaleKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
@@ -34,7 +35,18 @@
 void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>();
-    k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED, sampling_policy);
+    k->set_target(CLScheduler::get().target());
+    k->configure(input, output, policy, border_mode, sampling_policy);
     _kernel = std::move(k);
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(*_kernel);
+
+    // In the case of NHWC we can't have undefined border mode as this would require to access elements outside z dimension,
+    // so we treat it like border constant.
+    if(border_mode == BorderMode::UNDEFINED && input->info()->data_layout() == DataLayout::NHWC)
+    {
+        border_mode = BorderMode::CONSTANT;
+    }
     _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
 }
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index a92fbce..7a20d9f 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -77,15 +77,16 @@
 Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported");
 
     // Create intermediate tensor info
     DataType   tmp_data_type = is_data_type_quantized_asymmetric(input->data_type()) ? DataType::S32 : input->data_type();
-    TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type));
+    TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type).set_is_resizable(true));
 
     TensorShape max_sum_shape = input->tensor_shape();
     max_sum_shape.set(0, 1);
-    TensorInfo tensor_info_max(input->clone()->set_tensor_shape(max_sum_shape));
-    TensorInfo tensor_info_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()));
+    TensorInfo tensor_info_max(input->clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true));
+    TensorInfo tensor_info_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true));
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum));
     ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DNormKernel::validate(&tensor_info_tmp, &tensor_info_sum, output));
diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp
index f785c75..4286cf6 100644
--- a/src/runtime/CL/functions/CLWarpAffine.cpp
+++ b/src/runtime/CL/functions/CLWarpAffine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,7 +31,7 @@
 
 using namespace arm_compute;
 
-void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLWarpAffineKernel>();
     k->configure(input, output, matrix, policy);
diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp
index b445b3b..4603ee0 100644
--- a/src/runtime/CL/functions/CLWarpPerspective.cpp
+++ b/src/runtime/CL/functions/CLWarpPerspective.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,7 +31,7 @@
 
 using namespace arm_compute;
 
-void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLWarpPerspectiveKernel>();
     k->configure(input, output, matrix, policy);
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index d542781..5233ff4 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -48,7 +48,7 @@
     // Output auto inizialitation if not yet initialized
     TensorInfo  tmp_output_info = *output->clone();
     TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
-    auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type(), inputs_vector[0]->fixed_point_position());
+    auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
     unsigned int width_offset = 0;
     for(const auto &input : inputs_vector)
@@ -73,7 +73,7 @@
     TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
 
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
     ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
 
     unsigned int width_offset = 0;
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 49753ad..a70389a 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -33,17 +33,34 @@
 
 namespace
 {
-Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims)
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataLayout data_layout)
 {
     Size2D output_tile = Size2D{};
 
-    if(kernel_dims == Size2D(3U, 3U))
+    const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height);
+
+    // Check if the input spatial dimensions are smaller than 4
+    const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW);
+
+    if(kernel_max_dim == 3U)
     {
-        output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+        if(kernel_dims == Size2D(3U, 3U))
+        {
+            output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U);
+        }
+        else if(kernel_dims == Size2D(3U, 1U))
+        {
+            output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U);
+        }
+        else
+        {
+            output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U);
+        }
     }
-    else if(kernel_dims == Size2D(5U, 5U))
+    else if(kernel_max_dim == 5U)
     {
-        output_tile = Size2D(4U, 4U);
+        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U,
+                             kernel_dims.height == 1 ? 1U : 4U);
     }
 
     return output_tile;
@@ -82,7 +99,7 @@
     // Input shape, kernel size and output tile
     const Size2D input_dims  = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
     const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
-    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, input->info()->data_layout());
 
     // Check if the Winograd configuration requires fast math
     if(!enable_fast_math)
@@ -139,7 +156,7 @@
     // Input shape, kernel size and output tile
     const Size2D input_dims  = Size2D(input->tensor_shape()[idx_width], input->tensor_shape()[idx_height]);
     const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
-    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, input->data_layout());
 
     // Check if the Winograd configuration requires fast math
     if(!enable_fast_math)
diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp
index c0ebd24..2d52f33 100644
--- a/src/runtime/CL/tuners/BifrostTuner.cpp
+++ b/src/runtime/CL/tuners/BifrostTuner.cpp
@@ -124,15 +124,195 @@
         k.set_lws_hint(lws_hint);
     }
 }
+
+void tune_col2im_kernel(CLCol2ImKernel &k)
+{
+    cl::NDRange     lws_hint   = k.lws_hint();
+    const GPUTarget gpu_target = k.get_target();
+
+    // Configure the local work size for Bifrost with a value obtained
+    // via exhaustive autotuning over 30 representative tensor shapes.
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
+    {
+        if((k._convolved_dims.first == 7) || (k._convolved_dims.first == 14))
+        {
+            lws_hint = cl::NDRange(1, 7, 1);
+        }
+        else
+        {
+            lws_hint = cl::NDRange(1, 8, 1);
+        }
+    }
+
+    k.set_lws_hint(lws_hint);
+}
+
+void tune_im2col_kernel(CLIm2ColKernel &k)
+{
+    cl::NDRange     lws_hint   = k.lws_hint();
+    const GPUTarget gpu_target = k.get_target();
+
+    // Local work size optimized for the 11x11 AlexNet convolution on Bifrost.
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76) && k._kernel_dims.width == 11)
+    {
+        const bool is_square_kernel = (k._kernel_dims.width == k._kernel_dims.height);
+        if(!is_square_kernel && k._kernel_dims.width > 1 && !k._conv_info.has_padding())
+        {
+            lws_hint = cl::NDRange(1, 1, 1);
+        }
+    }
+    k.set_lws_hint(lws_hint);
+}
+
+void tune_depthwise_im2col_kernel(CLDepthwiseIm2ColKernel &k)
+{
+    cl::NDRange     lws_hint   = k.lws_hint();
+    const GPUTarget gpu_target = k.get_target();
+
+    // Configure the local work size for Bifrost with a value obtained
+    // via exhaustive autotuning for the MobileNets tensor shapes.
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
+    {
+        lws_hint = cl::NDRange(1, 2, 1);
+    }
+
+    k.set_lws_hint(lws_hint);
+}
+
+void tune_gemv_kernel(CLGEMMMatrixVectorMultiplyKernel &k)
+{
+    cl::NDRange     lws_hint   = k.lws_hint();
+    const GPUTarget gpu_target = k.get_target();
+
+    // Configure the local work size for Bifrost with a value obtained
+    // via exhaustive autotuning for the MobileNets tensor shapes.
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
+    {
+        lws_hint = cl::NDRange(1, 1, 1);
+    }
+
+    k.set_lws_hint(lws_hint);
+}
+
+void tune_gemm_kernel(CLGEMMMatrixMultiplyKernel &k)
+{
+    cl::NDRange     lws_hint   = k.lws_hint();
+    const GPUTarget gpu_target = k.get_target();
+
+    // Configure LWS hint
+    switch(gpu_target)
+    {
+        case GPUTarget::G71:
+        case GPUTarget::G72:
+        case GPUTarget::G51:
+        case GPUTarget::G51BIG:
+        case GPUTarget::G51LIT:
+        case GPUTarget::G76:
+            if(k._input1->info()->dimension(1) == 24)
+            {
+                // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
+                lws_hint = cl::NDRange(2, 2);
+            }
+            else if(k._output->info()->dimension(1) == 196)
+            {
+                lws_hint = cl::NDRange(1, 7);
+            }
+            else
+            {
+                lws_hint = cl::NDRange(8, 8);
+            }
+            break;
+        default:
+            lws_hint = cl::NullRange;
+    }
+
+    k.set_lws_hint(lws_hint);
+}
+
+void tune_pooling_kernel(CLPoolingLayerKernel &k)
+{
+    cl::NDRange     lws_hint   = k.lws_hint();
+    const GPUTarget gpu_target = k.get_target();
+
+    // Configure the local work size (hint) from the first two dimensions of the global work size.
+    // On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_3_optimized
+    // kernel is launched with gws=(9, 33, C). In any case, the hint will be ignored if it is
+    // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with).
+    if(k._input->info()->data_layout() == DataLayout::NCHW)
+    {
+        if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
+        {
+            cl::NDRange gws = ICLKernel::gws_from_window(k.window());
+            lws_hint        = cl::NDRange(gws[0], gws[1], 1);
+        }
+    }
+
+    k.set_lws_hint(lws_hint);
+}
+
+void tune_scale_kernel(CLScaleKernel &k)
+{
+    cl::NDRange               lws_hint      = k.lws_hint();
+    const GPUTarget           gpu_target    = k.get_target();
+    const DataType            dt            = k.input()->info()->data_type();
+    const InterpolationPolicy interpolation = k._interpolationPolicy;
+
+    // Configure the local work size for Bifrost, interpolation (bilinear) and datatype F32.
+    // The value are obtained via exhaustive autotuning.
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (dt == DataType::F32) && (interpolation == InterpolationPolicy::BILINEAR))
+    {
+        auto dim_0 = k.output()->info()->dimension(0);
+        if(dim_0 == 480)
+        {
+            lws_hint = cl::NDRange(2, 1);
+        }
+        else if(dim_0 == 3120)
+        {
+            lws_hint = cl::NDRange(2, 8);
+        }
+        else if(dim_0 == 4160)
+        {
+            lws_hint = cl::NDRange(4, 8);
+        }
+        k.set_lws_hint(lws_hint);
+    }
+}
 } // namespace
 
 void BifrostTuner::tune_kernel_static(ICLKernel &kernel)
 {
-    // Continue on tuning if dynamic tuning
     if(dynamic_cast<CLDirectConvolutionLayerKernel *>(&kernel) != nullptr)
     {
         tune_direct_convolution_kernel(*utils::cast::polymorphic_downcast<CLDirectConvolutionLayerKernel *>(&kernel));
     }
+    else if(dynamic_cast<CLCol2ImKernel *>(&kernel) != nullptr)
+    {
+        tune_col2im_kernel(*utils::cast::polymorphic_downcast<CLCol2ImKernel *>(&kernel));
+    }
+    else if(dynamic_cast<CLIm2ColKernel *>(&kernel) != nullptr)
+    {
+        tune_im2col_kernel(*utils::cast::polymorphic_downcast<CLIm2ColKernel *>(&kernel));
+    }
+    else if(dynamic_cast<CLDepthwiseIm2ColKernel *>(&kernel) != nullptr)
+    {
+        tune_depthwise_im2col_kernel(*utils::cast::polymorphic_downcast<CLDepthwiseIm2ColKernel *>(&kernel));
+    }
+    else if(dynamic_cast<CLGEMMMatrixVectorMultiplyKernel *>(&kernel) != nullptr)
+    {
+        tune_gemv_kernel(*utils::cast::polymorphic_downcast<CLGEMMMatrixVectorMultiplyKernel *>(&kernel));
+    }
+    else if(dynamic_cast<CLGEMMMatrixMultiplyKernel *>(&kernel) != nullptr)
+    {
+        tune_gemm_kernel(*utils::cast::polymorphic_downcast<CLGEMMMatrixMultiplyKernel *>(&kernel));
+    }
+    else if(dynamic_cast<CLPoolingLayerKernel *>(&kernel) != nullptr)
+    {
+        tune_pooling_kernel(*utils::cast::polymorphic_downcast<CLPoolingLayerKernel *>(&kernel));
+    }
+    else if(dynamic_cast<CLScaleKernel *>(&kernel) != nullptr)
+    {
+        tune_scale_kernel(*utils::cast::polymorphic_downcast<CLScaleKernel *>(&kernel));
+    }
 }
 
 void BifrostTuner::tune_kernel_dynamic(ICLKernel &kernel)
diff --git a/src/runtime/CL/tuners/MidgardTuner.cpp b/src/runtime/CL/tuners/MidgardTuner.cpp
new file mode 100644
index 0000000..cae3123
--- /dev/null
+++ b/src/runtime/CL/tuners/MidgardTuner.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/tuners/MidgardTuner.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernels.h"
+#include "arm_compute/core/utils/misc/Cast.h"
+
+namespace arm_compute
+{
+namespace tuners
+{
+namespace
+{
+void tune_gemm_kernel(CLGEMMMatrixMultiplyKernel &k)
+{
+    cl::NDRange     lws_hint   = k.lws_hint();
+    const GPUTarget gpu_target = k.get_target();
+
+    switch(gpu_target)
+    {
+        case GPUTarget::MIDGARD:
+        case GPUTarget::T600:
+        case GPUTarget::T700:
+        case GPUTarget::T800:
+            if(k._output->info()->dimension(1) == 196)
+            {
+                lws_hint = cl::NDRange(1, 7);
+            }
+            else
+            {
+                lws_hint = cl::NDRange(8, 8);
+            }
+            break;
+        default:
+            lws_hint = cl::NullRange;
+    }
+
+    k.set_lws_hint(lws_hint);
+}
+} // namespace
+
+void MidgardTuner::tune_kernel_static(ICLKernel &kernel)
+{
+    if(dynamic_cast<CLGEMMMatrixMultiplyKernel *>(&kernel) != nullptr)
+    {
+        tune_gemm_kernel(*utils::cast::polymorphic_downcast<CLGEMMMatrixMultiplyKernel *>(&kernel));
+    }
+}
+
+void MidgardTuner::tune_kernel_dynamic(ICLKernel &kernel)
+{
+    ARM_COMPUTE_UNUSED(kernel);
+}
+} // namespace tuners
+} // namespace arm_compute
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 92dce34..de28b4f 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/CPUUtils.h"
 
+#include <atomic>
 #include <condition_variable>
 #include <iostream>
 #include <mutex>
@@ -37,7 +38,59 @@
 
 namespace arm_compute
 {
-class Thread
+namespace
+{
+class ThreadFeeder
+{
+public:
+    /** Constructor
+     *
+     * @param[in] start First value that will be returned by the feeder
+     * @param[in] end   End condition (The last value returned by get_next() will be end - 1)
+     */
+    explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0)
+        : _atomic_counter(start), _end(end)
+    {
+    }
+    /** Return the next element in the range if there is one.
+     *
+     * @param[out] next Will contain the next element if there is one.
+     *
+     * @return False if the end of the range has been reached and next wasn't set.
+     */
+    bool get_next(unsigned int &next)
+    {
+        next = atomic_fetch_add_explicit(&_atomic_counter, 1u, std::memory_order_relaxed);
+        return next < _end;
+    }
+
+private:
+    std::atomic_uint   _atomic_counter;
+    const unsigned int _end;
+};
+
+/** Execute workloads[info.thread_id] first, then call the feeder to get the index of the next workload to run.
+ *
+ * Will run workloads until the feeder reaches the end of its range.
+ *
+ * @param[in]     workloads The array of workloads
+ * @param[in,out] feeder    The feeder indicating which workload to execute next.
+ * @param[in]     info      Threading and CPU info.
+ */
+void process_workloads(std::vector<IScheduler::Workload> &workloads, ThreadFeeder &feeder, const ThreadInfo &info)
+{
+    unsigned int workload_index = info.thread_id;
+    do
+    {
+        ARM_COMPUTE_ERROR_ON(workload_index >= workloads.size());
+        workloads[workload_index](info);
+    }
+    while(feeder.get_next(workload_index));
+}
+
+} //namespace
+
+class CPPScheduler::Thread
 {
 public:
     /** Start a new thread. */
@@ -51,11 +104,15 @@
     /** Destructor. Make the thread join. */
     ~Thread();
 
-    /** Request the worker thread to start executing the given kernel
-     * This function will return as soon as the kernel has been sent to the worker thread.
+    /** Request the worker thread to start executing workloads.
+     *
+     * The thread will start by executing workloads[info.thread_id] and will then call the feeder to
+     * get the index of the following workload to run.
+     *
+     * @note This function will return as soon as the workloads have been sent to the worker thread.
      * wait() needs to be called to ensure the execution is complete.
      */
-    void start(ICPPKernel *kernel, const Window &window, const ThreadInfo &info);
+    void start(std::vector<IScheduler::Workload> *workloads, ThreadFeeder &feeder, const ThreadInfo &info);
 
     /** Wait for the current kernel execution to complete. */
     void wait();
@@ -64,39 +121,38 @@
     void worker_thread();
 
 private:
-    std::thread             _thread;
-    ICPPKernel             *_kernel{ nullptr };
-    Window                  _window;
-    ThreadInfo              _info;
-    std::mutex              _m;
-    std::condition_variable _cv;
-    bool                    _wait_for_work{ false };
-    bool                    _job_complete{ true };
-    std::exception_ptr      _current_exception;
+    std::thread                        _thread{};
+    ThreadInfo                         _info{};
+    std::vector<IScheduler::Workload> *_workloads{ nullptr };
+    ThreadFeeder                      *_feeder{ nullptr };
+    std::mutex                         _m{};
+    std::condition_variable            _cv{};
+    bool                               _wait_for_work{ false };
+    bool                               _job_complete{ true };
+    std::exception_ptr                 _current_exception{ nullptr };
 };
 
-Thread::Thread()
-    : _thread(), _window(), _info(), _m(), _cv(), _current_exception(nullptr)
+CPPScheduler::Thread::Thread()
 {
     _thread = std::thread(&Thread::worker_thread, this);
 }
 
-Thread::~Thread()
+CPPScheduler::Thread::~Thread()
 {
     // Make sure worker thread has ended
     if(_thread.joinable())
     {
-        start(nullptr, Window(), ThreadInfo());
+        ThreadFeeder feeder;
+        start(nullptr, feeder, ThreadInfo());
         _thread.join();
     }
 }
 
-void Thread::start(ICPPKernel *kernel, const Window &window, const ThreadInfo &info)
+void CPPScheduler::Thread::start(std::vector<IScheduler::Workload> *workloads, ThreadFeeder &feeder, const ThreadInfo &info)
 {
-    _kernel = kernel;
-    _window = window;
-    _info   = info;
-
+    _workloads = workloads;
+    _feeder    = &feeder;
+    _info      = info;
     {
         std::lock_guard<std::mutex> lock(_m);
         _wait_for_work = true;
@@ -105,7 +161,7 @@
     _cv.notify_one();
 }
 
-void Thread::wait()
+void CPPScheduler::Thread::wait()
 {
     {
         std::unique_lock<std::mutex> lock(_m);
@@ -118,7 +174,7 @@
     }
 }
 
-void Thread::worker_thread()
+void CPPScheduler::Thread::worker_thread()
 {
     while(true)
     {
@@ -129,15 +185,14 @@
         _current_exception = nullptr;
 
         // Time to exit
-        if(_kernel == nullptr)
+        if(_workloads == nullptr)
         {
             return;
         }
 
         try
         {
-            _window.validate();
-            _kernel->run(_window, _info);
+            process_workloads(*_workloads, *_feeder, _info);
         }
         catch(...)
         {
@@ -174,56 +229,90 @@
     return _num_threads;
 }
 
-void CPPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
+void CPPScheduler::run_workloads(std::vector<IScheduler::Workload> &workloads)
+{
+    const unsigned int num_threads = std::min(_num_threads, static_cast<unsigned int>(workloads.size()));
+    if(num_threads < 1)
+    {
+        return;
+    }
+    ThreadFeeder feeder(num_threads, workloads.size());
+    ThreadInfo   info;
+    info.cpu_info          = &_cpu_info;
+    info.num_threads       = num_threads;
+    unsigned int t         = 0;
+    auto         thread_it = _threads.begin();
+    for(; t < num_threads - 1; ++t, ++thread_it)
+    {
+        info.thread_id = t;
+        thread_it->start(&workloads, feeder, info);
+    }
+
+    info.thread_id = t;
+    process_workloads(workloads, feeder, info);
+
+    try
+    {
+        for(auto &thread : _threads)
+        {
+            thread.wait();
+        }
+    }
+    catch(const std::system_error &e)
+    {
+        std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
+    }
+}
+
+void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
 
-    /** [Scheduler example] */
-    ThreadInfo info;
-    info.cpu_info = &_cpu_info;
-
     const Window      &max_window     = kernel->window();
-    const unsigned int num_iterations = max_window.num_iterations(split_dimension);
-    info.num_threads                  = std::min(num_iterations, _num_threads);
+    const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
+    const unsigned int num_threads    = std::min(num_iterations, _num_threads);
 
     if(num_iterations == 0)
     {
         return;
     }
 
-    if(!kernel->is_parallelisable() || info.num_threads == 1)
+    if(!kernel->is_parallelisable() || num_threads == 1)
     {
+        ThreadInfo info;
+        info.cpu_info = &_cpu_info;
         kernel->run(max_window, info);
     }
     else
     {
-        int  t         = 0;
-        auto thread_it = _threads.begin();
-
-        for(; t < info.num_threads - 1; ++t, ++thread_it)
+        unsigned int num_windows = 0;
+        switch(hints.strategy())
         {
-            Window win     = max_window.split_window(split_dimension, t, info.num_threads);
-            info.thread_id = t;
-            thread_it->start(kernel, win, info);
-        }
-
-        // Run last part on main thread
-        Window win     = max_window.split_window(split_dimension, t, info.num_threads);
-        info.thread_id = t;
-        kernel->run(win, info);
-
-        try
-        {
-            for(auto &thread : _threads)
+            case StrategyHint::STATIC:
+                num_windows = num_threads;
+                break;
+            case StrategyHint::DYNAMIC:
             {
-                thread.wait();
+                // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
+                const unsigned int max_iterations = static_cast<unsigned int>(_num_threads) * 3;
+                num_windows                       = num_iterations > max_iterations ? max_iterations : num_iterations;
+                break;
             }
+            default:
+                ARM_COMPUTE_ERROR("Unknown strategy");
         }
-        catch(const std::system_error &e)
+        std::vector<IScheduler::Workload> workloads(num_windows);
+        for(unsigned int t = 0; t < num_windows; t++)
         {
-            std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
+            //Capture 't' by copy, all the other variables by reference:
+            workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info)
+            {
+                Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
+                win.validate();
+                kernel->run(win, info);
+            };
         }
+        run_workloads(workloads);
     }
-    /** [Scheduler example] */
 }
 } // namespace arm_compute
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index 2adc14c..3701159 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -41,14 +41,23 @@
     ARM_COMPUTE_ERROR_ON(num_threads != 1);
 }
 
-void SingleThreadScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
+void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
 {
-    ARM_COMPUTE_UNUSED(split_dimension);
+    ARM_COMPUTE_UNUSED(hints);
     ThreadInfo info;
     info.cpu_info = &_cpu_info;
     kernel->run(kernel->window(), info);
 }
 
+void SingleThreadScheduler::run_workloads(std::vector<Workload> &workloads)
+{
+    ThreadInfo info;
+    info.cpu_info = &_cpu_info;
+    for(auto &wl : workloads)
+    {
+        wl(info);
+    }
+}
 unsigned int SingleThreadScheduler::num_threads() const
 {
     return 1;
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index 7e8bf2b..6c21086 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -69,49 +69,83 @@
 using namespace arm_compute;
 
 #if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
-struct PerCPUData
-{
-    CPUModel     model     = CPUModel::GENERIC;
-    unsigned int midr      = 0;
-    bool         model_set = false;
-};
 
+bool model_supports_dot(CPUModel model)
+{
+    switch(model)
+    {
+        case CPUModel::GENERIC_FP16_DOT:
+        case CPUModel::A55r1:
+            return true;
+        default:
+            return false;
+    }
+}
+
+bool model_supports_fp16(CPUModel model)
+{
+    switch(model)
+    {
+        case CPUModel::GENERIC_FP16:
+        case CPUModel::GENERIC_FP16_DOT:
+        case CPUModel::A55r1:
+            return true;
+        default:
+            return false;
+    }
+}
 /* Convert an MIDR register value to a CPUModel enum value. */
 CPUModel midr_to_model(const unsigned int midr)
 {
-    CPUModel model;
+    CPUModel model = CPUModel::GENERIC;
 
     // Unpack variant and CPU ID
-    const int variant = (midr >> 20) & 0xF;
-    const int cpunum  = (midr >> 4) & 0xFFF;
+    const int implementer = (midr >> 24) & 0xFF;
+    const int variant     = (midr >> 20) & 0xF;
+    const int cpunum      = (midr >> 4) & 0xFFF;
 
-    // Only CPUs we have code paths for are detected.  All other CPUs can be safely classed as "GENERIC"
-    switch(cpunum)
+    if(implementer == 0x41) // Arm CPUs
     {
-        case 0xd03:
-            model = CPUModel::A53;
-            break;
-
-        case 0xd05:
-            if(variant != 0)
-            {
-                model = CPUModel::A55r1;
-            }
-            else
-            {
-                model = CPUModel::A55r0;
-            }
-            break;
-
-        default:
-            model = CPUModel::GENERIC;
-            break;
+        // Only CPUs we have code paths for are detected.  All other CPUs can be safely classed as "GENERIC"
+        switch(cpunum)
+        {
+            case 0xd03: // A53
+            case 0xd04: // A35
+                model = CPUModel::A53;
+                break;
+            case 0xd05: // A55
+                if(variant != 0)
+                {
+                    model = CPUModel::A55r1;
+                }
+                else
+                {
+                    model = CPUModel::A55r0;
+                }
+                break;
+            case 0xd0a: // A75
+                if(variant != 0)
+                {
+                    model = CPUModel::GENERIC_FP16_DOT;
+                }
+                else
+                {
+                    model = CPUModel::GENERIC_FP16;
+                }
+                break;
+            case 0xd0b: // A76
+                model = CPUModel::GENERIC_FP16_DOT;
+                break;
+            default:
+                model = CPUModel::GENERIC;
+                break;
+        }
     }
 
     return model;
 }
 
-void populate_models_cpuid(std::vector<PerCPUData> &cpusv)
+void populate_models_cpuid(std::vector<CPUModel> &cpusv)
 {
     // If the CPUID capability is present, MIDR information is provided in /sys. Use that to populate the CPU model table.
     uint32_t i = 0;
@@ -126,16 +160,14 @@
             std::string line;
             if(bool(getline(file, line)))
             {
-                const unsigned long midr = support::cpp11::stoul(line, nullptr, 16);
-                c.midr                   = (midr & 0xffffffff);
-                c.model                  = midr_to_model(c.midr);
-                c.model_set              = true;
+                const unsigned long midr = support::cpp11::stoul(line, nullptr, support::cpp11::NumericBase::BASE_16);
+                c                        = midr_to_model(midr & 0xffffffff);
             }
         }
     }
 }
 
-void populate_models_cpuinfo(std::vector<PerCPUData> &cpusv)
+void populate_models_cpuinfo(std::vector<CPUModel> &cpusv)
 {
     // If "long-form" cpuinfo is present, parse that to populate models.
     std::regex proc_regex("^processor.*(\\d+)$");
@@ -160,7 +192,7 @@
             if(std::regex_match(line, match, proc_regex))
             {
                 std::string id     = match[1];
-                int         newcpu = support::cpp11::stoi(id, nullptr, 0);
+                int         newcpu = support::cpp11::stoi(id, nullptr);
 
                 if(curcpu >= 0 && midr == 0)
                 {
@@ -170,9 +202,7 @@
 
                 if(curcpu >= 0)
                 {
-                    cpusv[curcpu].midr      = midr;
-                    cpusv[curcpu].model     = midr_to_model(midr);
-                    cpusv[curcpu].model_set = true;
+                    cpusv[curcpu] = midr_to_model(midr);
                 }
 
                 midr   = 0;
@@ -183,28 +213,28 @@
 
             if(std::regex_match(line, match, imp_regex))
             {
-                int impv = support::cpp11::stoi(match[1], nullptr, 16);
+                int impv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
                 midr |= (impv << 24);
                 continue;
             }
 
             if(std::regex_match(line, match, var_regex))
             {
-                int varv = support::cpp11::stoi(match[1], nullptr, 16);
-                midr |= (varv << 16);
+                int varv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
+                midr |= (varv << 20);
                 continue;
             }
 
             if(std::regex_match(line, match, part_regex))
             {
-                int partv = support::cpp11::stoi(match[1], nullptr, 16);
+                int partv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
                 midr |= (partv << 4);
                 continue;
             }
 
             if(std::regex_match(line, match, rev_regex))
             {
-                int regv = support::cpp11::stoi(match[1], nullptr, 10);
+                int regv = support::cpp11::stoi(match[1], nullptr);
                 midr |= (regv);
                 midr |= (0xf << 16);
                 continue;
@@ -213,9 +243,7 @@
 
         if(curcpu >= 0)
         {
-            cpusv[curcpu].midr      = midr;
-            cpusv[curcpu].model     = midr_to_model(midr);
-            cpusv[curcpu].model_set = true;
+            cpusv[curcpu] = midr_to_model(midr);
         }
     }
 }
@@ -251,7 +279,7 @@
 
             line.erase(line.begin(), startfrom);
 
-            max_cpus = support::cpp11::stoi(line, nullptr, 0) + 1;
+            max_cpus = support::cpp11::stoi(line, nullptr) + 1;
             success  = true;
         }
     }
@@ -262,7 +290,6 @@
         max_cpus = std::thread::hardware_concurrency();
     }
 #endif /* BARE_METAL */
-
     return max_cpus;
 }
 #endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
@@ -274,9 +301,9 @@
 void get_cpu_configuration(CPUInfo &cpuinfo)
 {
 #if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
-    bool cpuid        = false;
-    bool fp16_support = false;
-    bool dot_support  = false;
+    bool cpuid               = false;
+    bool hwcaps_fp16_support = false;
+    bool hwcaps_dot_support  = false;
 
     const uint32_t hwcaps = getauxval(AT_HWCAP);
 
@@ -287,54 +314,17 @@
 
     if((hwcaps & HWCAP_ASIMDHP) != 0)
     {
-        fp16_support = true;
+        hwcaps_fp16_support = true;
     }
 
     if((hwcaps & HWCAP_ASIMDDP) != 0)
     {
-        dot_support = true;
+        hwcaps_dot_support = true;
     }
 
-#ifdef __aarch64__
-    /* Pre-4.15 kernels don't have the ASIMDDP bit.
-     *
-     * Although the CPUID bit allows us to read the feature register
-     * directly, the kernel quite sensibly masks this to only show
-     * features known by it to be safe to show to userspace.  As a
-     * result, pre-4.15 kernels won't show the relevant bit in the
-     * feature registers either.
-     *
-     * So for now, use a whitelist of CPUs known to support the feature.
-     */
-    if(!dot_support && cpuid)
-    {
-        /* List of CPUs with dot product support:         A55r1       A75r1       A75r2  */
-        const unsigned int dotprod_whitelist_masks[]  = { 0xfff0fff0, 0xfff0fff0, 0xfff0fff0, 0 };
-        const unsigned int dotprod_whitelist_values[] = { 0x4110d050, 0x4110d0a0, 0x4120d0a0, 0 };
-
-        unsigned long cpuid;
-
-        __asm __volatile(
-            "mrs %0, midr_el1\n"
-            : "=r"(cpuid)
-            :
-            : );
-
-        for(int i = 0; dotprod_whitelist_values[i] != 0; i++)
-        {
-            if((cpuid & dotprod_whitelist_masks[i]) == dotprod_whitelist_values[i])
-            {
-                dot_support = true;
-                break;
-            }
-        }
-    }
-#endif /* __aarch64__ */
     const unsigned int max_cpus = get_max_cpus();
     cpuinfo.set_cpu_num(max_cpus);
-    cpuinfo.set_fp16(fp16_support);
-    cpuinfo.set_dotprod(dot_support);
-    std::vector<PerCPUData> percpu(max_cpus);
+    std::vector<CPUModel> percpu(max_cpus, CPUModel::GENERIC);
     if(cpuid)
     {
         populate_models_cpuid(percpu);
@@ -344,10 +334,17 @@
         populate_models_cpuinfo(percpu);
     }
     int j(0);
+    // Update dot product and FP16 support if all CPUs support these features:
+    bool all_support_dot  = true;
+    bool all_support_fp16 = true;
     for(const auto &v : percpu)
     {
-        cpuinfo.set_cpu_model(j++, v.model);
+        all_support_dot &= model_supports_dot(v);
+        all_support_fp16 &= model_supports_fp16(v);
+        cpuinfo.set_cpu_model(j++, v);
     }
+    cpuinfo.set_dotprod(all_support_dot || hwcaps_dot_support);
+    cpuinfo.set_fp16(all_support_fp16 || hwcaps_fp16_support);
 #else  /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
     ARM_COMPUTE_UNUSED(cpuinfo);
 #endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index 2a710f7..a7a56b6 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -37,7 +37,7 @@
 using namespace arm_compute;
 
 GCConvolutionLayerReshapeWeights::GCConvolutionLayerReshapeWeights()
-    : _weights_reshape_kernel(), _weights_reshaped()
+    : _weights_reshape_kernel()
 {
 }
 
@@ -68,7 +68,7 @@
 
 GCConvolutionLayer::GCConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _mm_gemm(), _output_col2im_kernel(), _fill_border(), _activationlayer_function(), _original_weights(nullptr),
-      _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_first_run(true), _is_activationlayer_enabled(false)
+      _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
@@ -88,7 +88,7 @@
 }
 
 void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                   const Size2D &dilation, const ActivationLayerInfo &act_info)
+                                   const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
@@ -96,8 +96,10 @@
     ARM_COMPUTE_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+    ARM_COMPUTE_ERROR_ON(num_groups > 1);
+    ARM_COMPUTE_UNUSED(num_groups);
 
-    _is_first_run     = true;
+    _is_prepared      = false;
     _original_weights = weights;
 
     if(biases != nullptr)
@@ -148,7 +150,7 @@
     shape_im2col.set(1, mat_input_rows);
     shape_im2col.set(2, 1);
 
-    TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->info()->fixed_point_position());
+    TensorInfo im2col_reshaped_info(shape_im2col, 1, dt);
     _input_im2col_reshaped.allocator()->init(im2col_reshaped_info);
     _memory_group.manage(&_input_im2col_reshaped);
 
@@ -158,7 +160,7 @@
     shape_gemm.set(1, mat_input_rows);
     const DataType gemm_data_type = dt;
 
-    TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
+    TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
     _gemm_output.allocator()->init(info_gemm);
     _memory_group.manage(&_gemm_output);
 
@@ -182,9 +184,6 @@
 
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
 
-    // Allocate intermediate tensor
-    _weights_reshaped.allocator()->allocate();
-
     //Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
 
@@ -198,17 +197,7 @@
 
 void GCConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _reshape_weights.run();
-        _is_first_run = false;
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -219,17 +208,34 @@
 
     // Run gemm on reshaped matrices
     _mm_gemm.run();
-
     GCScheduler::get().memory_barrier();
+
     // Reshape output matrix
     GCScheduler::get().dispatch(_output_col2im_kernel, false);
+    GCScheduler::get().memory_barrier();
 
     _memory_group.release();
 
-    GCScheduler::get().memory_barrier();
     // Run Activation Layer
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.run();
     }
 }
+
+void GCConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights reshaping and mark as unused
+        _weights_reshaped.allocator()->allocate();
+        _reshape_weights.run();
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
+
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
index a300033..6b8e341 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
@@ -40,7 +40,7 @@
 
 GCFullyConnectedLayer::GCFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(),
-      _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
+      _original_weights(nullptr), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
 {
 }
 
@@ -80,13 +80,14 @@
 }
 
 void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output,
-                                      bool transpose_weights, bool are_weights_reshaped, bool retain_internal_weights)
+                                      FullyConnectedLayerInfo fc_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
 
-    _are_weights_reshaped = transpose_weights ? are_weights_reshaped : true;
+    _original_weights     = weights;
+    _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
     _is_fc_after_conv     = true;
     _accumulate_biases    = false;
 
@@ -141,25 +142,13 @@
         configure_fc_fc(input, weights_to_use, output);
     }
 
-    // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
-    if(!_are_weights_reshaped && !retain_internal_weights)
-    {
-        // Allocate the tensor for the weights reshaped
-        _reshape_weights_output.allocator()->allocate();
-    }
-
-    ARM_COMPUTE_ERROR_ON(retain_internal_weights && _reshape_weights_output.gc_buffer() == 0);
-    _are_weights_reshaped = _are_weights_reshaped || retain_internal_weights;
+    ARM_COMPUTE_ERROR_ON(fc_info.retain_internal_weights && _reshape_weights_output.gc_buffer() == 0);
+    _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
 }
 
 void GCFullyConnectedLayer::run()
 {
-    // Reshape of the weights (happens only once)
-    if(!_are_weights_reshaped)
-    {
-        _are_weights_reshaped = true;
-        _reshape_weights_kernel.run();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -187,3 +176,21 @@
 
     _memory_group.release();
 }
+
+void GCFullyConnectedLayer::prepare()
+{
+    // Reshape of the weights (happens only once)
+    if(!_are_weights_reshaped)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run reshape weights kernel and mark weights as unused
+        _reshape_weights_output.allocator()->allocate();
+        _reshape_weights_kernel.run();
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
+
+        _are_weights_reshaped = true;
+    }
+}
\ No newline at end of file
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
index 79f8f71..8ae91ee 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
@@ -73,8 +73,8 @@
 } // namespace
 
 GCGEMM::GCGEMM(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
-      _is_first_run(true), _reshape_b_only_on_first_run(false)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
+      _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
 
@@ -87,6 +87,8 @@
 
     // Check if we need to reshape the matrix B only on the first run
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _is_prepared                 = false;
+    _original_b                  = b;
 
     const IGCTensor *matrix_a = a;
     const IGCTensor *matrix_b = b;
@@ -136,7 +138,10 @@
     {
         // Allocate intermediate tensors
         _tmp_a.allocator()->allocate();
-        _tmp_b.allocator()->allocate();
+        if(!_reshape_b_only_on_first_run)
+        {
+            _tmp_b.allocator()->allocate();
+        }
     }
 
     // Configure matrix addition kernel
@@ -155,23 +160,21 @@
 
 void GCGEMM::run()
 {
+    prepare();
+
     _memory_group.acquire();
+
     if(_is_interleaved_transposed)
     {
         // Run interleave kernel
         GCScheduler::get().dispatch(_interleave_kernel, false);
 
-        if(_is_first_run)
-        {
-            // Run transpose kernel
-            GCScheduler::get().dispatch(_transpose_kernel, false);
-            _is_first_run = false;
-        }
-        else if(!_reshape_b_only_on_first_run)
+        if(!_reshape_b_only_on_first_run)
         {
             // Run transpose kernel
             GCScheduler::get().dispatch(_transpose_kernel, false);
         }
+
         GCScheduler::get().memory_barrier();
     }
 
@@ -184,5 +187,27 @@
         GCScheduler::get().memory_barrier();
         GCScheduler::get().dispatch(_ma_kernel);
     }
+
     _memory_group.release();
 }
+
+void GCGEMM::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+            // Run transpose kernel
+            _tmp_b.allocator()->allocate();
+            GCScheduler::get().dispatch(_transpose_kernel, false);
+            GCScheduler::get().memory_barrier();
+
+            // Mark original weights tensor as unused
+            _original_b->mark_as_unused();
+        }
+
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
index 1748a59..0c8769b 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -42,11 +42,11 @@
     ARM_COMPUTE_ERROR_ON(beta != 1.0f);
 
     // Create intermediate tensors shapes
-    _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+    _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
 
     TensorShape shape = input->info()->tensor_shape();
     shape.set(0, 1);
-    TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
+    TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
     _max.allocator()->init(tensor_info_max_sum);
     _sum.allocator()->init(tensor_info_max_sum);
 
diff --git a/src/runtime/ITensorAllocator.cpp b/src/runtime/ITensorAllocator.cpp
index 8294201..087f324 100644
--- a/src/runtime/ITensorAllocator.cpp
+++ b/src/runtime/ITensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,13 +31,14 @@
 using namespace arm_compute;
 
 ITensorAllocator::ITensorAllocator()
-    : _info()
+    : _info(), _alignment(0)
 {
 }
 
-void ITensorAllocator::init(const TensorInfo &input)
+void ITensorAllocator::init(const TensorInfo &input, size_t alignment)
 {
-    _info = input;
+    _info      = input;
+    _alignment = alignment;
 }
 
 TensorInfo &ITensorAllocator::info()
@@ -49,3 +50,8 @@
 {
     return _info;
 }
+
+size_t ITensorAllocator::alignment() const
+{
+    return _alignment;
+}
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index c27ff2f..d72c98b 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,12 +61,12 @@
 void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value,
                             bool use_fp16)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(gradient_size < 3);
-    ARM_COMPUTE_ERROR_ON(gradient_size > 7);
-    ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr);
     ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type));
+    ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7));
+    ARM_COMPUTE_ERROR_ON((lower_thr < 0) || (lower_thr >= upper_thr));
 
     _output = output;
 
@@ -119,7 +119,7 @@
     }
     else
     {
-        ARM_COMPUTE_ERROR("Gradient size not supported\n");
+        ARM_COMPUTE_ERROR("Gradient size %d not supported\n", gradient_size);
     }
 
     // Manage intermediate buffers
@@ -171,24 +171,23 @@
 void NECannyEdge::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
-    ARM_COMPUTE_ERROR_ON(_output == nullptr);
 
     _memory_group.acquire();
 
     // Run sobelNxN
     _sobel->run();
 
-    // Fill border before non-maxima suppression. Nop for border mode undefined.
-    NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ);
-
     // Run gradient
     NEScheduler::get().schedule(_gradient.get(), Window::DimY);
 
+    // Fill border before non-maxima suppression. Nop for border mode undefined.
+    NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ);
+
     // Run non-maxima suppression
     NEScheduler::get().schedule(&_non_max_suppr, Window::DimY);
 
     ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
-    memset(_output->buffer(), 0, _output->info()->total_size());
+    std::fill_n(_output->buffer(), _output->info()->total_size(), 0);
 
     // Fill border before edge trace
     NEScheduler::get().schedule(&_border_edge_trace, Window::DimZ);
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
new file mode 100644
index 0000000..21ab47d
--- /dev/null
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
+
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+NEConcatenateLayer::NEConcatenateLayer()
+    : _concat_function(nullptr)
+{
+}
+
+void NEConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output, DataLayoutDimension axis)
+{
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    switch(get_data_layout_dimension_index(output->info()->data_layout(), axis))
+    {
+        case 0:
+        {
+            auto func = support::cpp14::make_unique<NEWidthConcatenateLayer>();
+            func->configure(inputs_vector, output);
+            _concat_function = std::move(func);
+            break;
+        }
+        case 2:
+        {
+            auto func = support::cpp14::make_unique<NEDepthConcatenateLayer>();
+            func->configure(inputs_vector, output);
+            _concat_function = std::move(func);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Concatenation is supported across width and depth only!");
+    }
+}
+
+Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, DataLayoutDimension axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
+
+    switch(get_data_layout_dimension_index(output->data_layout(), axis))
+    {
+        case 0:
+            ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector, output));
+            break;
+        case 2:
+            ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayer::validate(inputs_vector, output));
+            break;
+        default:
+            ARM_COMPUTE_RETURN_ERROR_MSG("Concatenation is supported across width and depth only!");
+    }
+    return Status{};
+}
+
+void NEConcatenateLayer::run()
+{
+    ARM_COMPUTE_ERROR_ON(_concat_function == nullptr);
+    _concat_function->run();
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 7053c7e..931e5db 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
 #include <cmath>
@@ -41,10 +42,11 @@
 }
 
 void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
+                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_UNUSED(num_groups);
     ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
                                                             enable_fast_math));
 
@@ -78,8 +80,10 @@
 }
 
 Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
+                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on NEON");
+
     switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info))
     {
         case ConvolutionMethod::WINOGRAD:
@@ -108,6 +112,42 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
     ARM_COMPUTE_UNUSED(weights_info);
 
+    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+
+    /* Input spatial dims, kernel size, IFM/OFM, conv info*/
+    using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
+    using ConfigurationMethod      = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
+
+    const std::vector<ConfigurationMethod> known_configs =
+    {
+        // Alexnet
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM),
+        // VGG16 / VGG19
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM),
+        // Mobilenet 224
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM),
+        // Mobilenet 160
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM)
+    };
+
+    const auto find_config = [&](ConfigurationMethod c)
+    {
+        const ConvolutionConfiguration config = c.first;
+        const PadStrideInfo            info   = std::get<3>(config);
+
+        return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
+               && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
+               && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
+    };
+
+    std::vector<ConfigurationMethod>::const_iterator found;
+    if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+    {
+        return (*found).second;
+    }
+
     if(dilation != Size2D(1U, 1U) || Scheduler::get().cpu_info().get_cpu_model() == CPUModel::A53
        || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
     {
@@ -119,6 +159,12 @@
 
 void NEConvolutionLayer::run()
 {
+    prepare();
     _function->run();
 }
+
+void NEConvolutionLayer::prepare()
+{
+    _function->prepare();
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp
new file mode 100644
index 0000000..efa8b89
--- /dev/null
+++ b/src/runtime/NEON/functions/NECopy.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NECopy.h"
+
+#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NECopy::configure(ITensor *input, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NECopyKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
+}
+
+Status NECopy::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
+{
+    return NECopyKernel::validate(input, output);
+}
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 40ada8f..fda9f57 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -38,7 +38,8 @@
       _scaled_output(),
       _input(nullptr),
       _info(),
-      _inner_border()
+      _inner_border(),
+      _is_prepared(false)
 {
 }
 
@@ -62,18 +63,15 @@
                                                     info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, bias);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, bias);
 
     if(bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
     }
 
     if(output->tensor_shape().total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
 
         const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape());
 
@@ -104,6 +102,7 @@
     _input        = input;
     _info         = info;
     _inner_border = std::make_pair(inner_border_right, inner_border_top);
+    _is_prepared  = false;
 
     const unsigned int stride_x = info.stride().first;
     const unsigned int stride_y = info.stride().second;
@@ -115,8 +114,7 @@
 
     // configure scale function
     // Init and allocate intermmidiate tensor for output, same size as input but the first two axis are the same as the output tensor
-    const TensorInfo scale_out_info(compute_deconvolution_shape(*input->info(), stride_x, stride_y, inner_border_right, inner_border_top, info), 1, input->info()->data_type(),
-                                    input->info()->fixed_point_position());
+    const TensorInfo scale_out_info(compute_deconvolution_shape(*input->info(), stride_x, stride_y, inner_border_right, inner_border_top, info), 1, input->info()->data_type());
     _scaled_output.allocator()->init(scale_out_info);
 
     // setup the function to convolve the upscaled output
@@ -132,13 +130,21 @@
 
 void NEDeconvolutionLayer::run()
 {
+    prepare();
+
     _memory_group.acquire();
 
-    // Run upsample kernel
     _upsample_f.run();
-
-    // Run convolution layer
     _conv_f.run();
 
     _memory_group.release();
+}
+
+void NEDeconvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        _conv_f.prepare();
+        _is_prepared = true;
+    }
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
index 930f8d5..49db855 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
@@ -27,7 +27,9 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
@@ -41,18 +43,22 @@
 {
 }
 
-void NEDepthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output) // NOLINT
+void NEDepthConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output) // NOLINT
 {
-    ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
-
     _num_inputs             = inputs_vector.size();
     _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<NEDepthConcatenateLayerKernel[]>(_num_inputs);
     _border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
 
-    TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
+    std::vector<ITensorInfo *> inputs_vector_info;
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+    }
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector_info);
 
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+    ARM_COMPUTE_ERROR_THROW_ON(NEDepthConcatenateLayer::validate(inputs_vector_info, output->info()));
 
     unsigned int depth_offset = 0;
     for(unsigned int i = 0; i < _num_inputs; ++i)
@@ -67,6 +73,27 @@
     output->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
 }
 
+Status NEDepthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
+
+    // Output auto inizialitation if not yet initialized
+    TensorInfo  tmp_output_info = *output->clone();
+    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector);
+    auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
+
+    unsigned int depth_offset = 0;
+    for(const auto &input : inputs_vector)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayerKernel::validate(input, depth_offset, &tmp_output_info));
+        depth_offset += input->dimension(2);
+    }
+
+    return Status{};
+}
+
 void NEDepthConcatenateLayer::run()
 {
     for(unsigned i = 0; i < _num_inputs; ++i)
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 0a977ad..24b12f4 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -36,8 +36,8 @@
 using namespace arm_compute::misc::shape_calculator;
 
 NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
-    : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _input_nhwc(), _weights_hwio(), _output_nhwc(), _has_bias(false),
-      _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true)
+    : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(),
+      _has_bias(false), _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true), _permute(false)
 {
 }
 
@@ -57,29 +57,31 @@
                                                                                           input->info()->data_layout());
     _are_weights_reshaped = false;
     _is_nchw              = input->info()->data_layout() == DataLayout::NCHW;
-
-    ARM_COMPUTE_ERROR_ON(!_is_optimized && !_is_nchw);
+    _permute              = _is_optimized == _is_nchw;
 
     if(_is_optimized)
     {
         if(_is_nchw)
         {
             // Configure the function to transform the input tensor from NCHW -> NHWC
-            _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+            _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+            _permuted_input.info()->set_data_layout(DataLayout::NHWC);
 
             // Configure the function to transform the weights tensor from IHW -> HWI
-            _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+            _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+            _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
 
             // Configure optimized depthwise
-            _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, depth_multiplier, DataLayout::NHWC);
+            _dwc_kernel.configure(&_permuted_input, &_permuted_weights, &_permuted_output, conv_info, depth_multiplier, DataLayout::NHWC);
 
             // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-            _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
+            _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+            _permuted_output.info()->set_data_layout(DataLayout::NCHW);
 
             // Allocate tensors
-            _input_nhwc.allocator()->allocate();
-            _weights_hwio.allocator()->allocate();
-            _output_nhwc.allocator()->allocate();
+            _permuted_input.allocator()->allocate();
+            _permuted_weights.allocator()->allocate();
+            _permuted_output.allocator()->allocate();
         }
         else
         {
@@ -88,39 +90,88 @@
     }
     else
     {
-        // Allocate the intermediate accumulator tensor in case of fixed point input
+        // Allocate the intermediate accumulator tensor in case of quantized input
         if(_is_quantized)
         {
-            _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::S32));
+            TensorShape accum_shape = output->info()->tensor_shape();
+
+            if(!_is_nchw)
+            {
+                permute(accum_shape, PermutationVector(1U, 2U, 0U));
+            }
+
+            _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32));
             _accumulator.info()->set_quantization_info(input->info()->quantization_info());
             zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
         }
 
-        // Configure depthwise convolution kernel
-        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
-
-        // Configure border handler
-        _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-    }
-
-    // Configure biases accumulation
-    if(_has_bias || _is_quantized)
-    {
-        if(_is_quantized)
+        if(!_is_nchw)
         {
-            const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+            // Configure the function to transform the input tensor from NHWC -> NCHW
+            _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+            _permuted_input.info()->set_data_layout(DataLayout::NCHW);
 
-            float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-            int   output_multiplier, output_shift;
-            quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-            _output_stage_kernel.configure(&_accumulator, biases, output, output_multiplier, output_shift, output_quant_info.offset);
-            _accumulator.allocator()->allocate();
+            // Configure the function to transform the weights tensor from HWI -> IHW
+            _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+            _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+            // Configure optimized depthwise
+            _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier);
+
+            // Configure border handler
+            _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+            // Allocate tensors
+            _permuted_input.allocator()->allocate();
+            _permuted_weights.allocator()->allocate();
         }
         else
         {
-            _output_stage_kernel.configure(output, biases);
+            // Configure depthwise convolution kernel
+            _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
+
+            // Configure border handler
+            _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
         }
     }
+
+    // Configure biases accumulation
+    if(_is_quantized)
+    {
+        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+        int   output_multiplier, output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
+        _accumulator.allocator()->allocate();
+    }
+    else if(_has_bias)
+    {
+        _output_stage_kernel.configure((_is_nchw || _is_optimized) ? output : &_permuted_output, biases);
+    }
+
+    if(!_is_optimized && !_is_nchw)
+    {
+        // Configure the function to transform the convoluted output to NHWC
+        _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+        _permuted_output.allocator()->allocate();
+    }
+}
+
+Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+    }
+
+    return NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, output, conv_info, depth_multiplier);
 }
 
 void NEDepthwiseConvolutionLayer3x3::run()
@@ -132,32 +183,29 @@
         _dwc_kernel.generate_convolver();
     }
 
-    // Permute weights in HWIO format if the optimized kernel will be executedd
-    if(!_are_weights_reshaped && _is_optimized && _is_nchw)
+    // Permute weights
+    if(_permute)
     {
-        _are_weights_reshaped = true;
-        _permute_weights.run();
+        if(!_are_weights_reshaped)
+        {
+            _are_weights_reshaped = true;
+            _permute_weights.run();
+        }
+
+        _permute_input.run();
     }
 
     // Handle input
-    if(_is_optimized)
+    if(!_is_optimized)
     {
-        if(_is_nchw)
-        {
-            // Permute input to NHWC format execution
-            _permute_input.run();
-        }
-    }
-    else
-    {
-        // Fill border in NCHW format execution
+        // Fill border
         NEScheduler::get().schedule(&_border_handler, Window::DimX);
     }
 
     // Execute depthwise convolution
     NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
 
-    // Permute output to ACL's native NCHW format in case of NHWC execution
+    // Permute output
     if(_is_optimized && _is_nchw)
     {
         _permute_output.run();
@@ -168,27 +216,54 @@
     {
         NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
     }
+
+    // Permute output
+    if(!_is_optimized && !_is_nchw)
+    {
+        _permute_output.run();
+    }
 }
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
-    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
-      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
+    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _permute_input(),
+      _permute_weights(), _permute_output(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _permuted_input(), _permuted_weights(), _permuted_output(), _is_prepared(false),
+      _is_quantized(false), _is_nhwc(false), _original_weights(nullptr)
 {
 }
 
 void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
+    const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+    ARM_COMPUTE_UNUSED(channel_idx);
+
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != weights->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON((input->info()->dimension(channel_idx) * depth_multiplier) != weights->info()->dimension(channel_idx));
 
-    const size_t weights_w = weights->info()->dimension(0);
-    const size_t weights_h = weights->info()->dimension(1);
-    const size_t weights_z = weights->info()->dimension(2);
+    _is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
+
+    ITensor       *input_to_use   = input;
+    const ITensor *weights_to_use = weights;
+    ITensor       *output_to_use  = output;
+
+    if(_is_nhwc)
+    {
+        _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+        input_to_use = &_permuted_input;
+
+        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+        weights_to_use = &_permuted_weights;
+    }
+
+    const size_t weights_w = weights_to_use->info()->dimension(0);
+    const size_t weights_h = weights_to_use->info()->dimension(1);
+    const size_t weights_z = weights_to_use->info()->dimension(2);
 
     _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _is_first_run     = true;
-    _original_weights = weights;
+    _is_prepared      = false;
+    _original_weights = weights_to_use;
 
     // Should bias be appended ?
     bool append_bias = (biases != nullptr) && !_is_quantized;
@@ -200,6 +275,14 @@
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
 
+    if(_is_nhwc)
+    {
+        permute(output_shape, PermutationVector(1U, 2U, 0U));
+        _permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+        _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+        output_to_use = &_permuted_output;
+    }
+
     // Output width and height
     const unsigned int conv_w = output_shape.x();
     const unsigned int conv_h = output_shape.y();
@@ -209,41 +292,50 @@
     const size_t conv_size  = conv_w * conv_h;
 
     // Im2Col configuration
-    TensorShape shape_im2col = input->info()->tensor_shape();
+    TensorShape shape_im2col = input_to_use->info()->tensor_shape();
     shape_im2col.set(0, patch_size);
     shape_im2col.set(1, conv_size);
     shape_im2col.set(2, weights_z);
-    _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+    _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
+    _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
-    _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
-    _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
+    _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape).set_data_layout(DataLayout::NCHW));
+    _weights_reshape_kernel.configure(weights_to_use, &_weights_reshaped, append_bias ? biases : nullptr);
 
     // GEMV configuration
     DataType    v2mm_dt        = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
-    TensorShape shape_v2mm_out = input->info()->tensor_shape();
+    TensorShape shape_v2mm_out = input_to_use->info()->tensor_shape();
     shape_v2mm_out.set(0, conv_size * weights_z);
     shape_v2mm_out.set(1, 1);
     shape_v2mm_out.set(2, 1);
-    _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+    _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out).set_data_layout(DataLayout::NCHW));
     _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
     _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
-    _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
+    _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output_to_use, conv_w, conv_h);
 
     // Output staged configuration
     if(_is_quantized)
     {
-        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+        const QuantizationInfo output_quant_info = output->info()->quantization_info();
 
         float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
         int   output_multiplier, output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
+        _output_stage_kernel.configure(&_output_reshaped, biases, output_to_use, output_multiplier, output_shift, output_quant_info.offset);
         _output_reshaped.allocator()->allocate();
     }
 
+    if(_is_nhwc)
+    {
+        _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+        _permuted_input.allocator()->allocate();
+        _permuted_weights.allocator()->allocate();
+        _permuted_output.allocator()->allocate();
+    }
+
     // Fill borders on inputs
     PixelValue zero_in(static_cast<int32_t>(0));
     PixelValue zero_w(static_cast<int32_t>(0));
@@ -260,23 +352,102 @@
 
     // Allocate intermediate tensors
     _input_reshaped.allocator()->allocate();
-    _weights_reshaped.allocator()->allocate();
     _v2mm_output.allocator()->allocate();
 }
 
+Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                             unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+
+    // Clone output to use auto init
+    auto output_clone = output->clone();
+
+    const ITensorInfo *input_to_use   = input;
+    const ITensorInfo *weights_to_use = weights;
+    const ITensorInfo *output_to_use  = output_clone.get();
+
+    TensorShape permuted_input_shape   = input->tensor_shape();
+    TensorShape permuted_weights_shape = weights->tensor_shape();
+    TensorInfo  permuted_input;
+    TensorInfo  permuted_weights;
+
+    if(input->data_layout() == DataLayout::NHWC)
+    {
+        permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
+        permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
+
+        permuted_input   = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW));
+        permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW));
+
+        input_to_use   = &permuted_input;
+        weights_to_use = &permuted_weights;
+    }
+
+    const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+    const bool         append_bias  = (biases != nullptr) && !is_quantized;
+    TensorShape        output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+    const size_t       weights_w    = weights_to_use->dimension(0);
+    const size_t       weights_h    = weights_to_use->dimension(1);
+    const size_t       weights_z    = weights_to_use->dimension(2);
+    const unsigned int conv_w       = output_shape.x();
+    const unsigned int conv_h       = output_shape.y();
+    const size_t       patch_size   = weights_w * weights_h + (append_bias ? 1 : 0);
+    const size_t       conv_size    = conv_w * conv_h;
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*output_clone, input->clone()->set_tensor_shape(output_shape));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+
+    TensorInfo permuted_output;
+    if(input->data_layout() == DataLayout::NHWC)
+    {
+        permute(output_shape, PermutationVector(1U, 2U, 0U));
+        permuted_output = TensorInfo(output_clone->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_layout(DataLayout::NCHW));
+        output_to_use   = &permuted_output;
+    }
+
+    // Im2Col configuration
+    TensorShape shape_im2col = input_to_use->tensor_shape();
+    shape_im2col.set(0, patch_size);
+    shape_im2col.set(1, conv_size);
+    shape_im2col.set(2, weights_z);
+    TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+
+    // Weights reshape configuration
+    const TensorShape shape_weights_reshape(patch_size, weights_z);
+    TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape).set_data_layout(DataLayout::NCHW));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseWeightsReshapeKernel::validate(weights_to_use, &weights_reshaped, append_bias ? biases : nullptr));
+
+    // GEMV configuration
+    DataType    v2mm_dt        = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+    TensorShape shape_v2mm_out = input_to_use->tensor_shape();
+    shape_v2mm_out.set(0, conv_size * weights_z);
+    shape_v2mm_out.set(1, 1);
+    shape_v2mm_out.set(2, 1);
+    TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out).set_data_layout(DataLayout::NCHW));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
+
+    TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_to_use->tensor_shape()));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output_to_use, conv_w, conv_h));
+
+    if(is_quantized)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output_to_use));
+    }
+
+    return Status{};
+}
+
 void NEDepthwiseConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
+    prepare();
+
+    if(_is_nhwc)
     {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
-        NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
-        _is_first_run = false;
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
+        _permute_input.run();
     }
 
     NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
@@ -287,4 +458,30 @@
     {
         NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
     }
+
+    if(_is_nhwc)
+    {
+        _permute_output.run();
+    }
+}
+
+void NEDepthwiseConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        if(_is_nhwc)
+        {
+            _permute_weights.run();
+        }
+
+        // Run reshape and mark original weights as unused
+        _weights_reshaped.allocator()->allocate();
+        NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+        NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
+        _original_weights->mark_as_unused();
+
+        _is_prepared = true;
+    }
 }
diff --git a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
index d70a668..da2e49c 100644
--- a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,6 +45,14 @@
 
 void NEDepthwiseSeparableConvolutionLayer::run()
 {
+    prepare();
+
     _depthwise_conv.run();
     _pointwise_conv.run();
+}
+
+void NEDepthwiseSeparableConvolutionLayer::prepare()
+{
+    _depthwise_conv.prepare();
+    _pointwise_conv.prepare();
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 445864c..40e40c8 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -34,7 +34,7 @@
 using namespace arm_compute;
 
 NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), _is_fixed_point(false),
+    : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
       _is_activationlayer_enabled(false), _dim_split(Window::DimZ)
 {
 }
@@ -54,26 +54,10 @@
     // Check if bias should be added in the convolution result
     _has_bias = (bias != nullptr);
 
-    // Allocate the intermediate accumulator tensor in case of fixed point input
-    _is_fixed_point = is_data_type_fixed_point(input->info()->data_type());
-    if(_is_fixed_point)
+    _conv_kernel.configure(input, weights, output, conv_info);
+    if(_has_bias)
     {
-        const DataType promoted_dt = (input->info()->data_type() == DataType::QS8) ? DataType::QS16 : DataType::QS32;
-        _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, promoted_dt, output->info()->fixed_point_position()));
-        _memory_group.manage(&_accumulator);
-        _conv_kernel.configure(input, weights, &_accumulator, conv_info);
-
-        // When no bias is provided, we need to downscale the accumulator tensor
-        _output_stage_kernel.configure(&_accumulator, bias, output);
-        _accumulator.allocator()->allocate();
-    }
-    else
-    {
-        _conv_kernel.configure(input, weights, output, conv_info);
-        if(_has_bias)
-        {
-            _output_stage_kernel.configure(output, bias);
-        }
+        _output_stage_kernel.configure(output, bias);
     }
 
     // Add zero padding XY
@@ -92,12 +76,7 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
 
-    DataType data_type = output->data_type();
-    if(is_data_type_fixed_point(data_type))
-    {
-        // Promote data type in case of fixed point
-        data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
-    }
+    DataType   data_type = output->data_type();
     TensorInfo accumulator(output->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
 
     // Validate Convolution kernel
@@ -129,7 +108,7 @@
     _memory_group.acquire();
 
     NEScheduler::get().schedule(&_conv_kernel, _dim_split);
-    if(_has_bias || _is_fixed_point)
+    if(_has_bias)
     {
         NEScheduler::get().schedule(&_output_stage_kernel, Window::DimY);
     }
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index 32edf93..1814d61 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -32,6 +32,6 @@
 void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
-    k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, false, true);
+    k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, false, true);
     _kernel = std::move(k);
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 958d081..f1606aa 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include <algorithm>
@@ -35,120 +36,108 @@
 using namespace arm_compute;
 using namespace arm_compute::misc::shape_calculator;
 
-NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+namespace
 {
-}
-
-void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer)
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Perform validate step
-    ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerReshapeWeights::validate(input->info(), output->info(), transpose_weights, is_batched_fc_layer));
-
-    _transpose_weights   = transpose_weights;
-    _is_batched_fc_layer = is_batched_fc_layer;
-
-    // Check if we need to transpose the weights
-    if(_transpose_weights)
+    if(is_data_type_quantized_asymmetric(input.data_type()))
     {
-        if(_is_batched_fc_layer)
-        {
-            // Initialize the output tensor for transpose
-            _transpose_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*input->info())));
-            _memory_group.manage(&_transpose_output);
-            _transpose_kernel.configure(input, &_transpose_output);
+        // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+        // Extract and negate input and weights offset
+        const QuantizationInfo input_quantization_info(input.quantization_info().scale, -input.quantization_info().offset);
+        const QuantizationInfo weights_quantization_info(weights.quantization_info().scale, -weights.quantization_info().offset);
 
-            // Configure transpose 1xW kernel
-            _transpose1xW_kernel.configure(&_transpose_output, output);
-
-            // Allocate temporary tensor used for transposing the weights
-            _transpose_output.allocator()->allocate();
-        }
-        else
-        {
-            _transpose_kernel.configure(input, output);
-        }
+        // Validate gemmlowp function
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
+                                                                           &weights.clone()->set_quantization_info(weights_quantization_info),
+                                                                           &output));
     }
     else
     {
-        if(_is_batched_fc_layer)
-        {
-            // Configure transpose 1xW kernel
-            _transpose1xW_kernel.configure(input, output);
-        }
-    }
-}
-
-Status NEFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, const ITensorInfo *output, bool transpose_weights, bool is_batched_fc_layer)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!transpose_weights && !is_batched_fc_layer, "Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
-
-    if(transpose_weights)
-    {
-        if(is_batched_fc_layer)
-        {
-            std::unique_ptr<ITensorInfo> use_output = output->clone();
-            use_output->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*input));
-
-            ARM_COMPUTE_RETURN_ON_ERROR(NETransposeKernel::validate(input, use_output.get()));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(use_output.get(), output));
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR(NETransposeKernel::validate(input, output));
-        }
-    }
-    else
-    {
-        if(is_batched_fc_layer)
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(input, output));
-        }
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
     }
 
     return Status{};
 }
+} // namespace
 
-void NEFullyConnectedLayerReshapeWeights::run()
+void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
 {
-    _memory_group.acquire();
+    auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
+}
 
-    if(_transpose_weights)
-    {
-        NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
-    }
-
-    if(_is_batched_fc_layer)
-    {
-        NEScheduler::get().schedule(&_transpose1xW_kernel, Window::DimY);
-    }
-
-    _memory_group.release();
+Status NEFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return NETransposeKernel::validate(input, output);
 }
 
 NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(),
-      _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _original_weights(nullptr)
+    : _memory_group(std::move(memory_manager)), _im2col_kernel(), _convert_weights(), _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), _accumulate_biases_kernel(),
+      _im2col_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false),
+      _is_fc_after_conv(false), _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
 {
 }
 
-void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights, bool are_weights_reshaped)
+void NEFullyConnectedLayer::configure_mm(const ITensor *input, const ITensor *weights, ITensor *output)
 {
-    // With the Fully Connected layer we can have 4 different cases:
-    //  1) Convolution layer -> Fully Connected layer without batches
-    //  2) Fully Connected layer -> Fully Connected layer without batches
-    //  3) Convolution layer -> Fully Connected layer with batches
-    //  4) Fully Connected layer -> Fully Connected layer with batches
+    if(_is_quantized)
+    {
+        // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+        // Extract and negate input and weights offset
+        const QuantizationInfo input_quantization_info   = input->info()->quantization_info();
+        const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
 
-    // Expected shape before transpose and reshaping
-    // Input: In x B (In and B can be multi-dimensional)
-    // Weights: flat(In) x Out
-    // Biases: Out
-    // Output: Out x B (B can be multi-dimensional)
+        input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+        weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+
+        // Configure gemmlowp function
+        _mm_gemmlowp.configure(input, weights, output);
+
+        // Revert back QuantizatioInfo as input and weights could be used in other fully connected layers
+        input->info()->set_quantization_info(input_quantization_info);
+        weights->info()->set_quantization_info(weights_quantization_info);
+    }
+    else
+    {
+        // Configure matrix multiply kernel
+        _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
+    }
+}
+
+void NEFullyConnectedLayer::configure_conv_fc(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+    // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+    // Initialize output tensor for im2col
+    TensorShape shape_im2col = compute_flatten_shape(input->info());
+    _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+
+    // Configure im2col kernel
+    _memory_group.manage(&_im2col_output);
+    _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, true);
+
+    // Configure matrix multiply kernel
+    configure_mm(&_im2col_output, weights, output);
+
+    // Allocate the output tensor for im2col once all the configure methods have been called
+    _im2col_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayer::configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+    // Configure matrix multiply kernel
+    configure_mm(input, weights, output);
+}
+
+void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
+                                      FullyConnectedLayerInfo fc_info)
+{
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
     // Perform validate step
@@ -156,165 +145,184 @@
                                                                weights->info(),
                                                                biases != nullptr ? biases->info() : nullptr,
                                                                output->info(),
-                                                               transpose_weights,
-                                                               are_weights_reshaped));
+                                                               fc_info));
 
-    const int    num_batch_dimensions = std::max(0, static_cast<int>(output->info()->tensor_shape().num_dimensions()) - 1);
-    const int    num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
-    const size_t linear_input_size    = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
+    _are_weights_converted = true;
+    _are_weights_reshaped  = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+    _is_fc_after_conv      = true;
+    _accumulate_biases     = false;
+    _is_quantized          = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _original_weights      = weights;
 
-    _original_weights     = weights;
-    _linearize_input      = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
-    _are_weights_reshaped = are_weights_reshaped;
-    _accumulate_biases    = biases != nullptr;
-    _is_batched_fc_layer  = num_batch_dimensions > 0;
-
-    const size_t   interleave_width = 16 / input->info()->element_size();
-    const ITensor *weights_to_use   = weights;
-
-    if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
+    // Configure gemmlowp output
+    if(_is_quantized)
     {
-        weights_to_use = &_reshape_weights_output;
-
-        _reshape_weights_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_fully_connected_reshaped_weights_shape(weights->info(),
-                                                  transpose_weights,
-                                                  _is_batched_fc_layer, interleave_width)));
-
-        // Reshape the weights
-        _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+        _gemmlowp_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
     }
 
-    const ITensor *multiply_input = input;
-
-    if(_linearize_input)
+    // Configure accumulate biases kernel for non quantized asymmetric types
+    if(biases != nullptr && !_is_quantized)
     {
-        _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input->info(), num_input_dimensions)));
+        _accumulate_biases = true;
 
-        // Configure im2col kernel
-        _memory_group.manage(&_im2col_output);
-        _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true);
-
-        multiply_input = &_im2col_output;
-    }
-
-    int m = multiply_input->info()->dimension(1);
-    int k = multiply_input->info()->dimension(0);
-
-    if(_is_batched_fc_layer)
-    {
-        _interleave4x4_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_interleaved_shape(*multiply_input->info())));
-
-        // Configure interleave4x4 kernel
-        _memory_group.manage(&_interleave4x4_output);
-        _interleave4x4_kernel.configure(multiply_input, &_interleave4x4_output);
-
-        multiply_input = &_interleave4x4_output;
-    }
-
-    // Configure matrix multiply kernel
-    _mm_kernel.configure(multiply_input, weights_to_use, output, 1.0f, _is_batched_fc_layer, GEMMReshapeInfo(m, 0 /* no transpose */, k));
-
-    if(_accumulate_biases)
-    {
         // Configure accumulate biases kernel
         _accumulate_biases_kernel.configure(output, biases);
     }
 
-    // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
-    if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
-    {
-        // Allocate the tensor for the weights reshaped
-        _reshape_weights_output.allocator()->allocate();
-    }
+    // With the Fully Connected layer we can have 4 different cases:
+    //  1) Convolution layer -> Fully Connected layer without batches
+    //  2) Fully Connected layer -> Fully Connected layer without batches
+    //  3) Convolution layer -> Fully Connected layer with batches
+    //  4) Fully Connected layer -> Fully Connected layer with batches
 
-    if(_linearize_input)
-    {
-        _im2col_output.allocator()->allocate();
-    }
+    const ITensor *weights_to_use = weights;
 
-    if(_is_batched_fc_layer)
-    {
-        _interleave4x4_output.allocator()->allocate();
-    }
-}
-
-Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, weights, output);
-
-    const int    num_batch_dimensions = std::max(0, static_cast<int>(output->tensor_shape().num_dimensions()) - 1);
-    const int    num_input_dimensions = input->tensor_shape().num_dimensions() - num_batch_dimensions;
-    const size_t linear_input_size    = input->tensor_shape().total_size_lower(num_input_dimensions);
-
-    const bool linearize_input     = (input->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
-    const bool accumulate_biases   = biases != nullptr;
-    const bool is_batched_fc_layer = num_batch_dimensions > 0;
-
-    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size_upper(num_input_dimensions) != output->tensor_shape().total_size_upper(1));
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
-
-    const size_t                 interleave_width       = 16 / input->element_size();
-    const ITensorInfo           *weights_to_use         = weights;
-    std::unique_ptr<ITensorInfo> reshape_weights_output = input->clone();
-
-    if(!are_weights_reshaped && (transpose_weights || is_batched_fc_layer))
-    {
-        reshape_weights_output->set_tensor_shape(compute_fully_connected_reshaped_weights_shape(weights, transpose_weights, is_batched_fc_layer, interleave_width));
-
-        ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayerReshapeWeights::validate(weights, reshape_weights_output.get(), transpose_weights, is_batched_fc_layer));
-
-        weights_to_use = reshape_weights_output.get();
-    }
-
-    // Check correct shape of weights
+    // Check if we have a fully connected layer with batches
+    const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
     if(is_batched_fc_layer)
     {
-        // Transpose + Transpose1xW
-        ARM_COMPUTE_RETURN_ERROR_ON(weights_to_use->tensor_shape().x() != linear_input_size * interleave_width);
-        ARM_COMPUTE_RETURN_ERROR_ON(weights_to_use->tensor_shape().y() != static_cast<unsigned int>(std::ceil(static_cast<float>(output->tensor_shape().x()) / interleave_width)));
+        _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+                                                                                  input->info()->tensor_shape().cend(),
+                                                                                  output->info()->tensor_shape().cbegin() + 1));
     }
     else
     {
-        // Transpose
-        ARM_COMPUTE_RETURN_ERROR_ON(weights_to_use->tensor_shape().x() != output->tensor_shape().x());
-        ARM_COMPUTE_RETURN_ERROR_ON(weights_to_use->tensor_shape().y() != linear_input_size);
+        _is_fc_after_conv = input->info()->num_dimensions() > 1;
     }
 
-    const ITensorInfo           *multiply_input       = input;
-    std::unique_ptr<ITensorInfo> im2col_output        = input->clone();
-    std::unique_ptr<ITensorInfo> interleave4x4_output = input->clone();
-
-    if(linearize_input)
+    // Reshape weights if needed
+    if(!_are_weights_reshaped)
     {
-        im2col_output->set_tensor_shape(compute_im2col_fc_shape(input, num_input_dimensions));
-
-        ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, im2col_output.get(), Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true));
-
-        multiply_input = im2col_output.get();
+        // Reshape the weights
+        _reshape_weights_function.configure(weights, &_reshape_weights_output);
+        weights_to_use = &_reshape_weights_output;
     }
 
-    int m = multiply_input->dimension(1);
-    int k = multiply_input->dimension(0);
+    // Convert weights if needed
+    if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
+    {
+        // Convert weights
+        _convert_weights.configure(weights_to_use,
+                                   &_converted_weights_output,
+                                   input->info()->tensor_shape(),
+                                   fc_info.weights_trained_layout);
+
+        weights_to_use         = &_converted_weights_output;
+        _are_weights_converted = false;
+    }
+
+    ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
+    if(_is_fc_after_conv)
+    {
+        // Fully Connected layer after a Convolution Layer without batches
+        configure_conv_fc(input, weights_to_use, tmp_output);
+    }
+    else
+    {
+        // Fully Connected layer after a Fully Connected Layer without batches
+        configure_fc_fc(input, weights_to_use, tmp_output);
+    }
+
+    // Configure output stage for asymmetric quantized types
+    if(_is_quantized)
+    {
+        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
+        int   output_multiplier, output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
+        _gemmlowp_output.allocator()->allocate();
+    }
+
+    _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+}
+
+Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+                                       FullyConnectedLayerInfo fc_info)
+{
+    ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+    bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+    bool is_fc_after_conv = true;
+    bool is_quantized     = is_data_type_quantized_asymmetric(input->data_type());
+
+    const ITensorInfo &im2col_input      = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)));
+    const ITensorInfo &reshaped_weights  = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+    const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
+    const ITensorInfo &gemmlowp_output   = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+
+    // Configure accumulate biases kernel for non quantized asymmetric types
+    if(biases != nullptr && !is_quantized)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+    }
+
+    // With the Fully Connected layer we can have 4 different cases:
+    //  1) Convolution layer -> Fully Connected layer without batches
+    //  2) Fully Connected layer -> Fully Connected layer without batches
+    //  3) Convolution layer -> Fully Connected layer with batches
+    //  4) Fully Connected layer -> Fully Connected layer with batches
+
+    const ITensorInfo *input_to_use   = input;
+    const ITensorInfo *weights_to_use = weights;
+    const ITensorInfo *tmp_output     = (is_quantized) ? &gemmlowp_output : output;
+
+    // Check if we have a fully connected layer with batches
+    const bool is_batched_fc_layer = output->dimension(1) > 1;
 
     if(is_batched_fc_layer)
     {
-        interleave4x4_output->set_tensor_shape(compute_interleaved_shape(*multiply_input));
-
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(multiply_input, interleave4x4_output.get()));
-
-        multiply_input = interleave4x4_output.get();
+        is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3,
+                                                                                 input->tensor_shape().cend(),
+                                                                                 output->tensor_shape().cbegin() + 1));
+    }
+    else
+    {
+        is_fc_after_conv = input->num_dimensions() > 1;
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(multiply_input, weights_to_use, output, 1.0f, is_batched_fc_layer, GEMMReshapeInfo(m, 0 /* no transpose */, k)));
-
-    if(accumulate_biases)
+    if(!weights_reshaped)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->tensor_shape().x() != output->tensor_shape().x());
+        // Validate reshape weights kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+        weights_to_use = &reshaped_weights;
+    }
 
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+    if(is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+    {
+        // Validate convert weights kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(weights_to_use,
+                                                                             &converted_weights,
+                                                                             input->tensor_shape(),
+                                                                             fc_info.weights_trained_layout));
+        weights_to_use = &converted_weights;
+    }
+
+    if(is_fc_after_conv)
+    {
+        // Fully Connected layer after a Convolution Layer without batches
+        ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+        // Validate im2col kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, true));
+        input_to_use = &im2col_input;
+    }
+    else
+    {
+        // Fully Connected layer after a Fully Connected Layer without batches
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+    }
+    // Validate matrix multiply kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
+
+    // Validate output stage for asymmetric quantized types
+    if(is_quantized)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&gemmlowp_output, biases, output));
     }
 
     return Status{};
@@ -322,40 +330,94 @@
 
 void NEFullyConnectedLayer::run()
 {
-    // Reshape of the weights (happens only once)
-    if(!_are_weights_reshaped)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _are_weights_reshaped = true;
-        _reshape_weights_kernel.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
     // Linearize input if it comes from a convolutional layer
-    if(_linearize_input)
+    if(_is_fc_after_conv)
     {
         NEScheduler::get().schedule(&_im2col_kernel, Window::DimY);
     }
 
-    // Interleave input
-    if(_is_batched_fc_layer)
+    // Run matrix multiply
+    if(_is_quantized)
     {
-        NEScheduler::get().schedule(&_interleave4x4_kernel, Window::DimY);
+        _mm_gemmlowp.run();
+    }
+    else
+    {
+        _mm_gemm.run();
     }
 
-    // Run matrix multiply
-    NEScheduler::get().schedule(&_mm_kernel, _is_batched_fc_layer ? Window::DimY : Window::DimX);
-
     // Accumulate biases if provided
-    if(_accumulate_biases)
+    if(_is_quantized)
     {
-        NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+        _gemmlowp_output_stage.run();
+    }
+    else
+    {
+        if(_accumulate_biases)
+        {
+            NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+        }
     }
 
     _memory_group.release();
 }
+
+void NEFullyConnectedLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        auto release_unused = [](Tensor * w)
+        {
+            if(!w->is_used())
+            {
+                w->allocator()->free();
+            }
+        };
+
+        // Pointer to current weights
+        const ITensor *cur_weights = _original_weights;
+
+        // Reshape of the weights (happens only once)
+        if(!_are_weights_reshaped)
+        {
+            // Run reshape weights kernel and mark weights as unused
+            _reshape_weights_output.allocator()->allocate();
+            _reshape_weights_function.run();
+
+            cur_weights->mark_as_unused();
+            cur_weights           = &_reshape_weights_output;
+            _are_weights_reshaped = true;
+        }
+
+        // Convert weights if needed (happens only once)
+        if(!_are_weights_converted)
+        {
+            _converted_weights_output.allocator()->allocate();
+            _convert_weights.run();
+
+            cur_weights->mark_as_unused();
+            _are_weights_converted = true;
+        }
+
+        // Release reshaped weights if unused
+        release_unused(&_reshape_weights_output);
+
+        // Prepare GEMM prepare and release unused weights
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+        }
+
+        // Release converted weights if unused
+        release_unused(&_reshape_weights_output);
+        release_unused(&_converted_weights_output);
+
+        _is_prepared = true;
+    }
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 9168ed4..de51266 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -23,72 +23,56 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 
+#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/AssemblyHelper.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "support/ToolchainSupport.h"
 
 #include <cmath>
 
+using namespace arm_compute::misc::shape_calculator;
+
 namespace arm_compute
 {
 NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(), _B_pretransposed(),
-      _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
+    : _memory_group(memory_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr),
+      _run_vector_matrix_multiplication(false), _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
 
 void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
-    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
-    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
-    if(c != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
-        ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
-        ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
-        ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != d->info()->dimension(0), "The C matrix must have the same number of rows as the output matrix");
-        ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
-    }
+    ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
 
     // Check if we need to reshape the matrix B only on the first run
+    _is_prepared                      = false;
     _reshape_b_only_on_first_run      = gemm_info.reshape_b_only_on_first_run();
     _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+    _original_b                       = b;
 
-    const bool run_optimised = a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f)
-                               && setup_assembly_kernel(a, b, d, alpha, beta, _reshape_b_only_on_first_run, _workspace, _B_pretransposed, _memory_group, _asm_glue);
+    bool run_optimised = c == nullptr && bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), d->info(), alpha, beta, _reshape_b_only_on_first_run));
 
-    // Check if the first input tensor is a vector.
-    // If so, all the kernels for reshaping the tensors can be skipped
-    if(_run_vector_matrix_multiplication)
+    if(run_optimised)
     {
-        if(!run_optimised)
+        _asm_glue.configure(a, b, d, alpha, beta, _reshape_b_only_on_first_run);
+        ARM_COMPUTE_ERROR_ON(!_asm_glue.is_configured());
+    }
+    else
+    {
+        if(_run_vector_matrix_multiplication)
         {
             // Configure the matrix multiply kernel
             _mm_kernel.configure(a, b, d, alpha, false);
         }
-
-        // Configure matrix addition kernel
-        if(beta != 0 && c != nullptr)
-        {
-            _ma_kernel.configure(c, d, beta);
-            _run_addition = true;
-        }
-    }
-    else
-    {
-        if(!run_optimised)
+        else
         {
             TensorShape shape_tmp_a = a->info()->tensor_shape();
             TensorShape shape_tmp_b = b->info()->tensor_shape();
@@ -100,8 +84,8 @@
             shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
             shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
 
-            TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
-            TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
+            TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+            TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
 
             _tmp_a.allocator()->init(info_a);
             _tmp_b.allocator()->init(info_b);
@@ -128,42 +112,135 @@
 
             // Allocate once the all configure methods have been called
             _tmp_a.allocator()->allocate();
-            _tmp_b.allocator()->allocate();
-
-            // Configure matrix addition kernel
-            if(beta != 0 && c != nullptr)
+            if(!_reshape_b_only_on_first_run)
             {
-                _ma_kernel.configure(c, d, beta);
-                _run_addition = true;
+                _tmp_b.allocator()->allocate();
             }
         }
+
+        // Configure matrix addition kernel
+        if(beta != 0 && c != nullptr)
+        {
+            _ma_kernel.configure(c, d, beta);
+            _run_addition = true;
+        }
     }
 }
 
+Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+    if(c != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+    }
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+        if(gemm_info.depth_output_gemm3d() != 1)
+        {
+            if(gemm_info.reinterpret_input_as_3d())
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
+            }
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+        }
+    }
+
+    // Check if we need to run the optimized assembly kernel
+    const bool run_optimised = c == nullptr && bool(NEGEMMAssemblyDispatch::validate(a, b, output, alpha, beta, true));
+
+    if(!run_optimised)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 1, "NEGEMM cannot reinterpret the output tensor as 3D");
+
+        // Check if the first input tensor is a vector.
+        const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+        // Check if we need to reshape the matrix A and matrix B
+        const bool run_interleave_transpose = !run_vector_matrix_multiplication && !(gemm_info.reshape_b_only_on_first_run());
+
+        // Arguments used by GEMMReshapeInfo
+        // If we pass the matrix A and matrix B reshaped to NEGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to NEGEMMReshapeInfo
+        // in order to know how the matrices have been reshaped
+        const int m                         = a->dimension(1);
+        const int n                         = b->dimension(0);
+        const int k                         = a->dimension(0);
+        int       mult_transpose1xW_width   = 1;
+        int       mult_interleave4x4_height = 1;
+
+        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
+
+        const ITensorInfo *matrix_a_info = a;
+        const ITensorInfo *matrix_b_info = b;
+
+        TensorInfo tmp_a_info{};
+        TensorInfo tmp_b_info{};
+        TensorInfo tmp_output_info = *output->clone();
+
+        if(run_interleave_transpose)
+        {
+            matrix_a_info = &tmp_a_info;
+            matrix_b_info = &tmp_b_info;
+
+            // Validate interleave kernel
+            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info));
+
+            // Validate transpose kernel
+            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
+        }
+
+        // Validate matrix multiply
+        auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
+    }
+
+    return Status{};
+}
+
 void NEGEMM::run()
 {
-    _memory_group.acquire();
+    prepare();
 
-    if(_asm_glue._optimised_kernel != nullptr)
+    if(_asm_glue.is_configured())
     {
+        _memory_group.acquire();
         _asm_glue.run();
         _memory_group.release();
     }
     else
     {
+        _memory_group.acquire();
+
         if(!_run_vector_matrix_multiplication)
         {
             // Run interleave kernel
             NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
 
-            if(_is_first_run)
-            {
-                // Run transpose kernel
-                NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
-
-                _is_first_run = false;
-            }
-            else if(!_reshape_b_only_on_first_run)
+            if(!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
                 NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
@@ -181,4 +258,27 @@
         }
     }
 }
+
+void NEGEMM::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_asm_glue.is_configured())
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+            _asm_glue.prepare();
+        }
+        else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue.is_configured())
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+            _tmp_b.allocator()->allocate();
+            NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+            _original_b->mark_as_unused();
+        }
+
+        _is_prepared = true;
+    }
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
new file mode 100644
index 0000000..29db654
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
+#include "arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
+std::unique_ptr<IFunction> create_function_all_types(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+                                                     std::shared_ptr<IMemoryManager> memory_manager)
+
+{
+    //Note: It's safe to not check for FP16 support because this was already checked in NEGEMMAssemblyDispatch::configure()
+    switch(method)
+    {
+        case arm_gemm::GemmMethod::GEMM_INTERLEAVED:
+        {
+            if(!pretranspose_hint)
+            {
+                return nullptr;
+            }
+            auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
+            function->configure(a, b, d, alpha, beta, pretranspose_hint);
+            return std::move(function);
+        }
+        default:
+            return nullptr;
+    }
+}
+
+template <typename TypeInput, typename TypeOutput>
+std::unique_ptr<IFunction> create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+                                           std::shared_ptr<IMemoryManager> memory_manager)
+{
+    ARM_COMPUTE_UNUSED(method);
+    ARM_COMPUTE_UNUSED(a);
+    ARM_COMPUTE_UNUSED(b);
+    ARM_COMPUTE_UNUSED(d);
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_UNUSED(pretranspose_hint);
+    ARM_COMPUTE_UNUSED(memory_manager);
+    return nullptr;
+}
+
+#ifdef __aarch64__
+template <>
+std::unique_ptr<IFunction> create_function<int8_t, int32_t>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+                                                            std::shared_ptr<IMemoryManager> memory_manager)
+{
+    switch(method)
+    {
+        case arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT:
+        {
+            if(!pretranspose_hint)
+            {
+                return nullptr;
+            }
+            auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
+            function->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */);
+            return std::move(function);
+        }
+        default:
+            return nullptr;
+    }
+    return nullptr;
+}
+
+template <>
+std::unique_ptr<IFunction> create_function<uint8_t, uint32_t>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+                                                              std::shared_ptr<IMemoryManager> memory_manager)
+{
+    switch(method)
+    {
+        case arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT:
+        {
+            if(!pretranspose_hint)
+            {
+                return nullptr;
+            }
+            auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
+            function->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */);
+            return std::move(function);
+        }
+        default:
+            return nullptr;
+    }
+    return nullptr;
+}
+
+template <>
+std::unique_ptr<IFunction> create_function<float, float>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+                                                         std::shared_ptr<IMemoryManager> memory_manager)
+{
+    ARM_COMPUTE_UNUSED(pretranspose_hint);
+    ARM_COMPUTE_UNUSED(memory_manager);
+    switch(method)
+    {
+        case arm_gemm::GemmMethod::GEMM_NATIVE:
+        {
+            auto kernel = support::cpp14::make_unique<NEGEMMNativeWrapperKernel<float, float>>();
+            kernel->configure(a, b, d, alpha, beta);
+            auto function = support::cpp14::make_unique<NESimpleAssemblyFunction>();
+            function->configure(std::move(kernel));
+            return std::move(function);
+        }
+        default:
+            return nullptr;
+    }
+}
+#endif /* __aarch64__ */
+
+/** Fallback in case ACL doesn't have a function */
+template <typename TypeInput, typename TypeOutput>
+class Fallback : public NEGEMMAssemblyDispatch::IFallback
+{
+public:
+    void configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> &args, MemoryGroup &memory_group);
+    void run() override;
+    void prepare() override;
+    bool is_configured() const override;
+
+private:
+    /** Allocate a workspace tensor.
+     *
+     * @param[in] workspace_size Size to allocate.
+     * @param[in] memory_group   Tensor memory group.
+     * @param[in] alignment      Workspace memory alignment.
+     */
+    void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment);
+
+    /** Assembly Gemm kernel */
+    std::unique_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
+    /** Optimised NEON kernel */
+    std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
+    /** Input A */
+    const ITensor *_a
+    {
+        nullptr
+    };
+    /** Input B */
+    const ITensor *_b
+    {
+        nullptr
+    };
+    /** Output */
+    ITensor *_d{ nullptr };
+    /** GEMM workspace */
+    Tensor _workspace{};
+    /** Pre-transpose tensor */
+    Tensor _pretranspose{};
+    /** Prepared flag */
+    bool _is_prepared{ false };
+};
+
+template <typename TypeInput, typename TypeOutput>
+void Fallback<TypeInput, TypeOutput>::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> &args, MemoryGroup &memory_group)
+{
+    _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput>(args, nullptr);
+    if(_gemm_kernel_asm == nullptr)
+    {
+        //configuration not supported: Leave function unconfigured:
+        return;
+    }
+
+    // arm_compute wrapper for the Gemm object (see above)
+    std::unique_ptr<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>> acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>>();
+    ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
+    acl_gemm_wrapper->configure(_gemm_kernel_asm.get());
+    const size_t workspace_size = _gemm_kernel_asm->get_working_size();
+    if(workspace_size > 0)
+    {
+        // Allocate workspace
+        const unsigned int alignment = 4096;
+        allocate_workspace(workspace_size, memory_group, alignment);
+    }
+
+    //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and
+    //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
+    {
+        const int window_size = _gemm_kernel_asm->get_window_size();
+        if(window_size < args._maxthreads)
+        {
+            _gemm_kernel_asm->set_nthreads(window_size);
+        }
+    }
+
+    _optimised_kernel = std::move(acl_gemm_wrapper);
+    _a                = a;
+    _b                = b;
+    _d                = d;
+    // Check for pre-transposed support
+    if(_gemm_kernel_asm->B_pretranspose_required())
+    {
+        // Forcing 128-byte alignment (required by 32-bit kernels)
+        const unsigned int alignment           = 128;
+        const size_t       B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
+        _pretranspose.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment);
+        _pretranspose.allocator()->allocate();
+        ARM_COMPUTE_ERROR_ON_NULLPTR(_pretranspose.buffer());
+    }
+}
+
+template <typename TypeInput, typename TypeOutput>
+void Fallback<TypeInput, TypeOutput>::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Pretranspose B if required
+        if(_gemm_kernel_asm->B_pretranspose_required())
+        {
+            ARM_COMPUTE_ERROR_ON(_pretranspose.buffer() == nullptr);
+            const int  ldb            = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+            const auto in1_ptr        = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
+            const int  multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
+
+            _gemm_kernel_asm->pretranspose_B_array(_pretranspose.buffer(), in1_ptr, ldb, multi_stride_b);
+            _b->mark_as_unused();
+        }
+
+        _is_prepared = true;
+    }
+}
+
+template <typename TypeInput, typename TypeOutput>
+void Fallback<TypeInput, TypeOutput>::allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0");
+    _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment) }, 1, DataType::S8), alignment);
+    memory_group.manage(&_workspace);
+    _workspace.allocator()->allocate();
+}
+
+template <typename TypeInput, typename TypeOutput>
+bool Fallback<TypeInput, TypeOutput>::is_configured() const
+{
+    return _optimised_kernel != nullptr;
+}
+
+template <typename TypeInput, typename TypeOutput>
+void Fallback<TypeInput, TypeOutput>::run()
+{
+    const int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
+    int       ldb = 0;
+    const int ldd = _d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
+
+    // In the case of NHWC we want to interpret the output shape as 3D. Thus, the batch stride for A is
+    // the relevant multiple of the row stride.
+    const bool is_nhwc           = _a->info()->data_layout() == DataLayout::NHWC;
+    const int  stride_in_bytes_a = is_nhwc ? _a->info()->strides_in_bytes().y() * _d->info()->dimension(1) : _a->info()->strides_in_bytes().z();
+
+    const int batch_stride_a = stride_in_bytes_a / sizeof(TypeInput);
+    const int batch_stride_d = _d->info()->strides_in_bytes().z() / sizeof(TypeOutput);
+
+    const int multi_stride_a = _a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
+    int       multi_stride_b = 0;
+    const int multi_stride_d = _d->info()->strides_in_bytes()[3] / sizeof(TypeOutput);
+
+    const auto       in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes());
+    const TypeInput *in1_ptr = nullptr;
+    auto             out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer() + _d->info()->offset_first_element_in_bytes());
+
+    // Check if B is pre-tranposed and de-reference if not
+    if(!_gemm_kernel_asm->B_is_pretransposed())
+    {
+        ldb            = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+        multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
+        in1_ptr        = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
+    }
+
+    // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
+    if(_workspace.buffer() != nullptr)
+    {
+        _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer()));
+        const unsigned int window_size = _gemm_kernel_asm->get_window_size();
+        unsigned int       num_threads = NEScheduler::get().num_threads();
+        if(window_size < num_threads)
+        {
+            num_threads = window_size;
+            _gemm_kernel_asm->set_nthreads(num_threads);
+        }
+    }
+
+    // Prepare assembly kernel
+    prepare();
+
+    // Set gemm parameters
+    _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d);
+
+    // Schedule assembly kernel
+    NEScheduler::get().schedule(_optimised_kernel.get(), Window::DimX);
+}
+
+template <typename TypeInput, typename TypeOutput>
+void create_function_or_arm_gemm(std::unique_ptr<IFunction> &acl_function, std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group, const ITensor *a, const ITensor *b,
+                                 ITensor *d, float alpha, float beta, bool pretranspose_hint, std::shared_ptr<IMemoryManager> memory_manager)
+{
+    INEGEMMWrapperKernel::Params p           = INEGEMMWrapperKernel::extract_parameters(a, b, d);
+    const CPUInfo               &ci          = NEScheduler::get().cpu_info();
+    unsigned int                 num_threads = NEScheduler::get().num_threads();
+
+    arm_gemm::GemmArgs<TypeOutput> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+
+    //Try to create an ACL function:
+    acl_function = create_function_all_types(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager);
+    // If the type agnostic factory failed to create an ACL function, try the specialised one:
+    if(acl_function == nullptr)
+    {
+        acl_function = create_function<TypeInput, TypeOutput>(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager);
+    }
+    //If we still don't have an ACL function:
+    if(acl_function == nullptr)
+    {
+        //Fallback onto arm_gemm function if ACL doesn't support this method.
+        auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput>>();
+        fallback->configure(a, b, d, args, memory_group);
+        arm_gemm = std::move(fallback);
+    }
+}
+
+} //namespace
+
+NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager)
+    : _function(nullptr), _arm_gemm(nullptr), _memory_group(memory_manager), _memory_manager(memory_manager)
+{
+}
+
+Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, float alpha, float beta, bool pretranspose_hint)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_UNUSED(pretranspose_hint);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
+#ifndef __aarch64__
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 || a->data_type() == DataType::S8 || a->data_type() == DataType::QASYMM8, "8bit integer types only supported for aarch64");
+#endif /* __aarch64__ */
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::U8, DataType::QASYMM8, DataType::S8, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::S32 && d->data_type() != DataType::U32, "Only U32/S32 output supported for QASYMM8 input");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
+    return Status{};
+}
+
+void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(b);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(d);
+
+    //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
+    if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), d->info(), alpha, beta, pretranspose_hint))
+    {
+        return;
+    }
+
+    switch(a->info()->data_type())
+    {
+        case DataType::F32:
+            create_function_or_arm_gemm<float, float>(_function, _arm_gemm, _memory_group, a, b, d, alpha, beta, pretranspose_hint, _memory_manager);
+            break;
+#ifdef __aarch64__
+        case DataType::U8:
+        case DataType::QASYMM8:
+            create_function_or_arm_gemm<uint8_t, uint32_t>(_function, _arm_gemm, _memory_group, a, b, d, alpha, beta, pretranspose_hint, _memory_manager);
+            break;
+        case DataType::S8:
+            create_function_or_arm_gemm<int8_t, int32_t>(_function, _arm_gemm, _memory_group, a, b, d, alpha, beta, pretranspose_hint, _memory_manager);
+            break;
+#endif /* __aarch64__ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            create_function_or_arm_gemm<float16_t, float16_t>(_function, _arm_gemm, _memory_group, a, b, d, alpha, beta, pretranspose_hint, _memory_manager);
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            break;
+    }
+}
+
+void NEGEMMAssemblyDispatch::prepare()
+{
+    if(_function != nullptr)
+    {
+        _function->prepare();
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+        _arm_gemm->prepare();
+    }
+}
+
+bool NEGEMMAssemblyDispatch::is_configured() const
+{
+    return (_arm_gemm != nullptr && _arm_gemm->is_configured()) || _function != nullptr;
+}
+
+void NEGEMMAssemblyDispatch::run()
+{
+    _memory_group.acquire();
+    if(_function != nullptr)
+    {
+        _function->run();
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+        _arm_gemm->run();
+    }
+    _memory_group.release();
+}
+} //namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 2888b43..92e641e 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -23,10 +23,10 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
 
-#include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
@@ -34,98 +34,50 @@
 #include <cmath>
 #include <tuple>
 
-namespace
-{
-arm_compute::TensorShape get_reshaped_weights_shape(const arm_compute::ITensorInfo *weights, bool append_bias)
-{
-    const unsigned int mat_weights_cols = weights->dimension(3);
-    const unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (append_bias ? 1 : 0);
-    return arm_compute::TensorShape(mat_weights_cols, mat_weights_rows);
-}
-} // namespace
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
 
-namespace arm_compute
-{
-NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights()
+    : _weights_reshape_kernel()
 {
 }
 
-void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW)
+void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output)
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
     ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(),
                                                                           (biases != nullptr) ? biases->info() : nullptr,
-                                                                          output->info(),
-                                                                          transpose1xW));
+                                                                          output->info()));
 
-    // Check if bias are present, if yes they will be embedded to the weights matrix
-    const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
-    //const unsigned bias_element  = (append_biases) ? 1 : 0;
+    const bool     append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
     const ITensor *biases_to_use = (append_biases) ? biases : nullptr;
 
-    _transpose1xW = transpose1xW;
-
-    if(transpose1xW)
-    {
-        // Create tensor to store the reshaped weights
-        TensorInfo info_wr = weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(get_reshaped_weights_shape(weights->info(), append_biases));
-
-        _weights_reshaped.allocator()->init(info_wr);
-        _memory_group.manage(&_weights_reshaped);
-
-        _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
-        _weights_transposed_kernel.configure(&_weights_reshaped, output);
-
-        _weights_reshaped.allocator()->allocate();
-    }
-    else
-    {
-        _weights_reshape_kernel.configure(weights, biases_to_use, output);
-    }
+    _weights_reshape_kernel.configure(weights, biases_to_use, output);
 
     output->info()->set_quantization_info(weights->info()->quantization_info());
 }
 
-Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose1xW)
+Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-    if(!is_data_type_quantized_asymmetric(weights->data_type()))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
-    }
-    // Check if bias are present, if yes they will be embedded to the weights matrix
-    const bool append_bias = (biases != nullptr);
 
-    if(append_bias)
+    if(biases != nullptr)
     {
+        const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
         ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type()));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     }
 
-    // Checks performed when biases are present
-    if(append_bias)
+    if((output != nullptr) && (output->total_size() != 0))
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-    }
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
 
-    if(transpose1xW)
-    {
-        TensorInfo weights_reshaped = weights->clone()->set_tensor_shape(get_reshaped_weights_shape(weights, append_bias));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(&weights_reshaped, output));
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, output));
+        NEWeightsReshapeKernel::validate(weights, biases, output);
     }
 
     return Status{};
@@ -133,110 +85,21 @@
 
 void NEConvolutionLayerReshapeWeights::run()
 {
-    _memory_group.acquire();
-
     NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
-
-    if(_transpose1xW)
-    {
-        NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY);
-    }
-
-    _memory_group.release();
 }
 
-namespace
-{
-TensorShape get_reshaped_weights_shape_conv(const ITensorInfo *weights, bool append_bias, bool is_fully_connected_convolution)
-{
-    unsigned int mat_weights_cols = weights->dimension(3);
-    unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (append_bias ? 1 : 0);
-
-    if(is_fully_connected_convolution)
-    {
-        // Create tensor to store the reshaped weights
-        return TensorShape(mat_weights_cols, mat_weights_rows);
-    }
-    else
-    {
-        // Create tensor to store transposed weights
-        const float transpose_width = 16.0f / weights->element_size();
-        return TensorShape(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
-    }
-}
-
-Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                      const ActivationLayerInfo &act_info, DataType &dt,
-                                      bool &append_bias, bool &skip_im2col,
-                                      bool &are_weights_reshaped, unsigned int &kernel_width, unsigned int &kernel_height,
-                                      bool &is_fully_connected_convolution, bool &is_interleaved, bool &is_quantized, bool &is_activationlayer_enabled,
-                                      unsigned int &mat_weights_cols, unsigned int &mat_weights_rows,
-                                      unsigned int &conv_w, unsigned int &conv_h, const Size2D &dilation)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
-
-    DataLayout data_layout = input->data_layout();
-    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int  idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
-    ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && weights->dimension(idx_channel) != input->dimension(idx_channel));
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-    ARM_COMPUTE_RETURN_ERROR_ON(weights_info.are_reshaped() && is_data_type_quantized_asymmetric(input->data_type()));
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(data_layout == DataLayout::NHWC && input->data_type() != DataType::F32, "NHWC is only supported for FP32 data type.");
-
-    dt           = input->data_type();
-    is_quantized = is_data_type_quantized_asymmetric(dt);
-
-    if(biases != nullptr)
-    {
-        if(is_quantized)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        }
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && biases->dimension(0) != weights->dimension(3));
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-    }
-
-    // If we have 1x1 convolution and data layout is NHWC we can disable im2col
-    append_bias          = (biases != nullptr) && (!is_quantized);
-    are_weights_reshaped = weights_info.are_reshaped();
-    kernel_width         = (are_weights_reshaped) ? weights_info.kernel_size().first : weights->dimension(idx_width);
-    kernel_height        = (are_weights_reshaped) ? weights_info.kernel_size().second : weights->dimension(idx_height);
-    mat_weights_cols     = weights->dimension(3);
-    mat_weights_rows     = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel) + ((append_bias && !skip_im2col) ? 1 : 0);
-    skip_im2col          = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1);
-
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width), input->dimension(idx_height), kernel_width, kernel_height,
-                                                 conv_info, dilation);
-
-    // Check if its a "fully connected" convolution
-    is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
-    is_interleaved                 = (!is_fully_connected_convolution && !is_quantized);
-    is_activationlayer_enabled     = act_info.enabled();
-
-    return Status{};
-}
-} // namespace
-
 NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
-    : _asm_glue(), _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
-      _output_col2im_kernel(), _activationlayer_function(), _add_bias_kernel(), _original_weights(nullptr), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(),
-      _tmp_output(), _workspace(), _B_pretransposed(), _data_layout(DataLayout::NCHW), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false),
-      _is_interleaved(false), _is_activationlayer_enabled(false), _skip_im2col(false)
+    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
+      _add_bias_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false),
+      _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
-void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, ITensor *output, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
+void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, ITensor *output, int gemm_3d_depth)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info(), gemm_3d_depth, _skip_im2col));
+
     if(_is_quantized)
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -255,128 +118,145 @@
     }
     else
     {
-        _mm_kernel.configure(input, weights, output, 1.f, is_interleaved, reshape_info);
+        // Configure matrix multiply function
+        _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, gemm_3d_depth,
+                                                                                 _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */));
     }
 }
 
-void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                       const Size2D &dilation, const ActivationLayerInfo &act_info)
+Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int gemm_3d_depth, bool skip_im2col)
 {
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
 
-    DataType     dt{};
-    unsigned int kernel_width     = 0;
-    unsigned int kernel_height    = 0;
-    unsigned int mat_weights_cols = 0;
-    unsigned int mat_weights_rows = 0;
-    unsigned int conv_w           = 0;
-    unsigned int conv_h           = 0;
-
-    _data_layout           = input->info()->data_layout();
-    const bool is_nhwc     = _data_layout == DataLayout::NHWC;
-    const int  idx_width   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-    const int  idx_height  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-    const int  idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
-
-    Status status = validate_and_initialize_values(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), conv_info, weights_info, act_info, dt, _append_bias, _skip_im2col,
-                                                   _are_weights_reshaped,
-                                                   kernel_width, kernel_height,
-                                                   _is_fully_connected_convolution, _is_interleaved, _is_quantized, _is_activationlayer_enabled,
-                                                   mat_weights_cols, mat_weights_rows, conv_w, conv_h, dilation);
-
-    ARM_COMPUTE_ERROR_THROW_ON(status);
-
-    _original_weights                       = weights;
-    const unsigned int fixed_point_position = input->info()->fixed_point_position();
-    const ITensor     *biases_to_use        = (_append_bias) ? biases : nullptr;
-
-    bool run_optimised = dt == DataType::F32;
-
-    // Reshape weights if needed
-    if(run_optimised)
+    const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, skip_im2col);
+    if(is_quantized)
     {
-        TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
+        // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+        // Extract and negate input and weights offset
+        const QuantizationInfo input_quantization_info   = input->quantization_info();
+        const QuantizationInfo weights_quantization_info = weights->quantization_info();
 
-        // Create tensor to store the reshaped weights
-        _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
-        _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
-        weights = &_weights_reshaped;
+        std::unique_ptr<ITensorInfo> input_qa   = input->clone();
+        std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+        input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+        weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+
+        // Perform validation step on GEMMLowp
+        return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), output, gemm_info);
     }
     else
     {
-        if(_are_weights_reshaped)
-        {
-            if(_is_fully_connected_convolution || _is_quantized)
-            {
-                mat_weights_cols = weights_info.num_kernels();
-                mat_weights_rows = weights->info()->dimension(idx_height);
-            }
-            else
-            {
-                mat_weights_cols = weights_info.num_kernels();
-                mat_weights_rows = weights_info.kernel_size().first * weights_info.kernel_size().second * input->info()->dimension(idx_channel) + (_append_bias ? 1 : 0);
-            }
-        }
-        else
-        {
-            TensorShape reshaped_weights_shape;
+        // Perform validation step on Matrix multiply function
+        return NEGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+    }
+}
 
-            if(_is_fully_connected_convolution || _is_quantized)
-            {
-                reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
-            }
-            else
-            {
-                // Create tensor to store transposed weights
-                const float transpose_width = 16.0f / input->info()->element_size();
-                reshaped_weights_shape      = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
-                                                           static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
-            }
+Status NEGEMMConvolutionLayer::validate_gemm3d(DataType data_type, int gemm_3d_depth, bool skip_im2col)
+{
+    const bool         is_quantized          = is_data_type_quantized_asymmetric(data_type);
+    const DataType     output_gemm_data_type = is_quantized ? DataType::S32 : data_type;
+    const unsigned int mult_y                = skip_im2col ? 1U : gemm_3d_depth;
+    const unsigned int mult_z                = skip_im2col ? gemm_3d_depth : 1U;
 
-            // Create tensor to store the reshaped weights
-            _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
-            _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, _is_interleaved /* 1xW transpose */);
-            weights = &_weights_reshaped;
+    // Set dummy tensor shapes for the validation
+    const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type);
+    const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type);
+    const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, output_gemm_data_type);
+
+    return validate_mm(&dummy_input_info, &dummy_weights_info, &dummy_output_info, gemm_3d_depth, skip_im2col);
+}
+
+void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+                                       const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_UNUSED(num_groups);
+    ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConvolutionLayer::validate(input->info(),
+                                                                weights->info(),
+                                                                biases != nullptr ? biases->info() : nullptr,
+                                                                output->info(),
+                                                                conv_info,
+                                                                weights_info,
+                                                                dilation,
+                                                                act_info,
+                                                                num_groups));
+
+    const DataType   data_type   = input->info()->data_type();
+    const DataLayout data_layout = input->info()->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const int        idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    const unsigned int kernel_width  = weights->info()->dimension(idx_width);
+    const unsigned int kernel_height = weights->info()->dimension(idx_height);
+
+    _is_prepared      = weights_info.retain_internal_weights();
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _data_layout      = data_layout;
+    _skip_im2col      = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+    _skip_col2im      = data_layout == DataLayout::NHWC;
+    _append_bias      = (biases != nullptr) && (!_is_quantized);
+
+    const ITensor *gemm_input_to_use         = input;
+    ITensor       *gemm_output_to_use        = output;
+    ITensor       *gemm_output_staged_to_use = output;
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(idx_width),
+                                                 input->info()->dimension(idx_height),
+                                                 kernel_width,
+                                                 kernel_height,
+                                                 conv_info,
+                                                 dilation);
+
+    // Check if GEMM3D is supported
+    if(_skip_col2im)
+    {
+        // If not supported, we need to perform im2col and col2im (or reshape layer)
+        if(!bool(validate_gemm3d(input->info()->data_type(), conv_h, _skip_im2col)))
+        {
+            _skip_im2col = false;
+            _skip_col2im = false;
         }
     }
 
-    // In case we skip im2col we have to add bias
+    const unsigned bias_element  = (_append_bias && !_skip_im2col) ? 1 : 0;
+    const ITensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr;
+
+    // Get parameters from conv_info
+    unsigned int stride_x = 0;
+    unsigned int stride_y = 0;
+    std::tie(stride_x, stride_y) = conv_info.stride();
+
+    unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels);
+    unsigned int mat_weights_rows = weights->info()->dimension(idx_width) * weights->info()->dimension(idx_height) * weights->info()->dimension(idx_channel) + bias_element;
+
+    // _weights_reshaped will be auto configured in the kernel.
+    // Just append biases and do not transpose 1xW as it will be reshaped in NEGEMM
+    _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped);
+
+    // Create tensor to store im2col reshaped inputs
     if(!_skip_im2col)
     {
-        const unsigned int mat_input_cols = mat_weights_rows;
-        const unsigned int mat_input_rows = conv_w * conv_h;
-
-        // Create tensor to store im2col reshaped inputs
-        TensorShape shape_im2col(input->info()->tensor_shape());
-        shape_im2col.set(0, mat_input_cols);
-        shape_im2col.set(1, mat_input_rows);
+        // Calculate im2col shape
+        // For NEON the batch size is on the fourth dimension
+        TensorShape shape_im2col = input->info()->tensor_shape();
+        shape_im2col.set(0, mat_weights_rows);
+        shape_im2col.set(1, conv_w * conv_h);
         shape_im2col.set(2, 1);
-        _input_im2col_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-        _memory_group.manage(&_input_im2col_reshaped);
 
-        // Create tensor (interleave) to prepare input tensor for GEMM
-        if(!_is_fully_connected_convolution && !run_optimised && _is_interleaved)
-        {
-            TensorShape shape_interleaved(shape_im2col);
-            shape_interleaved.set(idx_width, shape_interleaved.x() * 4);
-            shape_interleaved.set(idx_height, std::ceil(shape_interleaved[idx_height] / 4.f));
-            _input_interleaved_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_interleaved));
-            _memory_group.manage(&_input_interleaved_reshaped);
-        }
+        _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+        _memory_group.manage(&_im2col_output);
 
-        // Create GEMM output tensor
-        TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
-        shape_gemm.set(0, mat_weights_cols);
-        shape_gemm.set(1, mat_input_rows);
-        const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
-        // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
-        TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
-        info_gemm.set_quantization_info(output->info()->quantization_info());
-        _gemm_output.allocator()->init(info_gemm);
+        // Configure
+        _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation);
 
-        // Configure im2col
-        _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias, false, false, dilation);
+        // Update GEMM input
+        gemm_input_to_use = &_im2col_output;
     }
     else if(_append_bias)
     {
@@ -384,129 +264,187 @@
         _add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
     }
 
-    // Configure matrix multiply
-    if(run_optimised)
+    // Create temporary GEMM output tensor in case we cannot skip col2im
+    if(!_skip_col2im)
     {
-        if(!setup_assembly_kernel(_skip_im2col ? input : &_input_im2col_reshaped, weights, is_nhwc ? output : &_gemm_output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue))
-        {
-            ARM_COMPUTE_ERROR("setup_assembly_kernel failed.");
-        }
-    }
-    else
-    {
-        if(_is_interleaved)
-        {
-            // Configure GEMMInterleave4x4. _input_interleaved_reshaped will be auto configured in the kernel
-            _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+        // Calculate GEMM output shape
+        TensorShape shape_gemm = _im2col_output.info()->tensor_shape();
+        shape_gemm.set(0, mat_weights_cols);
+        shape_gemm.set(1, conv_w * conv_h);
 
-            // Configure GEMM
-            configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output, _is_interleaved, GEMMReshapeInfo(_input_im2col_reshaped.info()->dimension(idx_height), 0 /* no transpose */,
-                                                                                                                _input_im2col_reshaped.info()->dimension(idx_width)));
-            _input_interleaved_reshaped.allocator()->allocate();
-        }
-        else
-        {
-            configure_mm(&_input_im2col_reshaped, weights, &_gemm_output, _is_interleaved);
-        }
+        // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
+        const DataType gemm_data_type = _is_quantized ? DataType::S32 : data_type;
+        TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
+        info_gemm.set_quantization_info(output->info()->quantization_info());
+        _gemm_output.allocator()->init(info_gemm);
+        _memory_group.manage(&_gemm_output);
+
+        // Update GEMM output
+        gemm_output_to_use = &_gemm_output;
     }
 
+    // Configure GEMM
+    configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, _skip_col2im ? conv_h : 1);
+
     if(!_skip_im2col)
     {
-        _input_im2col_reshaped.allocator()->allocate();
+        _im2col_output.allocator()->allocate();
+    }
 
-        // Configure output stage for quantized case
-        if(_is_quantized)
+    // Configure output stage for quantized case
+    if(_is_quantized)
+    {
+        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+        int   output_multiplier, output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+        _memory_group.manage(&_tmp_output);
+        gemm_output_staged_to_use = &_tmp_output;
+
+        _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset);
+    }
+
+    if(!_skip_col2im)
+    {
+        if(_data_layout == DataLayout::NCHW)
         {
-            const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
-
-            float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-            int   output_multiplier, output_shift;
-            quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-            _memory_group.manage(&_tmp_output);
-            _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output_quant_info.offset);
+            // Configure col2im
+            _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, Size2D(conv_w, conv_h));
         }
-
-        // Configure Col2Im
-        if(!is_nhwc)
+        else
         {
-            _output_col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, Size2D(conv_w, conv_h));
+            // Configure reshape layer
+            _reshape_layer.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output);
         }
+    }
 
-        if(_is_quantized)
-        {
-            _tmp_output.allocator()->allocate();
-        }
+    if(_is_quantized)
+    {
+        _tmp_output.allocator()->allocate();
+    }
+
+    if(!_skip_col2im)
+    {
         _gemm_output.allocator()->allocate();
     }
 
-    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), "Output shape does not match the expected one");
-
-    // Allocate intermediate tensor
-    if(!_are_weights_reshaped)
-    {
-        _weights_reshaped.allocator()->allocate();
-    }
+    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
+                             "Output shape does not match the expected one");
 
     //Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.configure(output, nullptr, act_info);
     }
+
+    ARM_COMPUTE_UNUSED(weights_info);
 }
 
 Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                        const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
+                                        const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
 {
-    ARM_COMPUTE_UNUSED(output);
-
-    DataType     dt{};
-    bool         append_bias{};
-    bool         skip_im2col{};
-    bool         are_weights_reshaped{};
-    bool         is_fully_connected_convolution{};
-    bool         is_interleaved{};
-    bool         is_quantized{};
-    bool         is_activationlayer_enabled{};
-    unsigned int kernel_width     = 0;
-    unsigned int kernel_height    = 0;
-    unsigned int mat_weights_cols = 0;
-    unsigned int mat_weights_rows = 0;
-    unsigned int conv_w           = 0;
-    unsigned int conv_h           = 0;
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON");
 
     const DataLayout data_layout = input->data_layout();
-    const bool       is_nhwc     = data_layout == DataLayout::NHWC;
+    const DataType   data_type   = input->data_type();
     const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const int        idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
 
-    Status status = validate_and_initialize_values(input, weights, biases, conv_info, weights_info, act_info, dt, append_bias, skip_im2col, are_weights_reshaped, kernel_width, kernel_height,
-                                                   is_fully_connected_convolution, is_interleaved, is_quantized, is_activationlayer_enabled, mat_weights_cols, mat_weights_rows,
-                                                   conv_w, conv_h, dilation);
+    const unsigned int kernel_width  = weights->dimension(idx_width);
+    const unsigned int kernel_height = weights->dimension(idx_height);
 
-    const Size2D kernel_weights = Size2D(kernel_width, kernel_height);
+    TensorInfo         im2col_reshaped_info, info_gemm, tmp_info, weights_reshaped_info;
+    const ITensorInfo *gemm_input_to_use         = input;
+    const ITensorInfo *gemm_output_to_use        = output;
+    const ITensorInfo *gemm_output_staged_to_use = output;
+    const ITensorInfo *weights_to_use            = weights;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
+    const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+    const bool append_bias  = (biases != nullptr) && (!is_quantized);
+    bool       skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+    bool       skip_col2im  = data_layout == DataLayout::NHWC;
 
-    std::unique_ptr<ITensorInfo> reshaped_weights = weights->clone();
-    bool                         optimised_kernel = false;
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
 
-    if(dt == DataType::F32)
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width),
+                                                 input->dimension(idx_height),
+                                                 kernel_width,
+                                                 kernel_height,
+                                                 conv_info,
+                                                 dilation);
+
+    // Check if GEMM3D is supported
+    if(skip_col2im)
     {
-        optimised_kernel = true;
+        // If not supported, we need to perform im2col and col2im (or reshape layer)
+        if(!bool(validate_gemm3d(input->data_type(), conv_h, skip_im2col)))
+        {
+            skip_im2col = false;
+            skip_col2im = false;
+        }
     }
 
-    const unsigned int mat_input_cols = mat_weights_rows;
-    const unsigned int mat_input_rows = conv_w * conv_h;
-    TensorShape        shape_im2col   = input->tensor_shape();
-    shape_im2col.set(0, mat_input_cols);
-    shape_im2col.set(1, mat_input_rows);
-    shape_im2col.set(2, 1);
-    TensorInfo im2_col_info = input->clone()->set_tensor_shape(shape_im2col);
+    const unsigned     bias_element  = (append_bias && !skip_im2col) ? 1 : 0;
+    const ITensorInfo *biases_to_use = (append_bias && !skip_im2col) ? biases : nullptr;
+
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+    // Validate biases
+    if(biases != nullptr)
+    {
+        if(is_quantized)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a());
+    }
+
+    unsigned int mat_weights_cols = weights->dimension(idx_kernels);
+    unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel) + bias_element;
+
+    // Output tensor auto inizialization if not yet initialized
+    ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases_to_use, nullptr));
+    weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, (append_bias && !skip_im2col)), 1, data_type);
+    weights_to_use        = &weights_reshaped_info;
 
     if(!skip_im2col)
     {
-        // Validate im2col
-        ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2_col_info, kernel_weights, conv_info, append_bias, false, false, dilation));
+        // Create tensor info for im2col reshaped inputs
+        // For NEON the batch size is on the fourth dimension
+        TensorShape shape_im2col = input->tensor_shape();
+        shape_im2col.set(0, mat_weights_rows);
+        shape_im2col.set(1, conv_w * conv_h);
+        shape_im2col.set(2, 1);
+
+        im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
+        im2col_reshaped_info.set_quantization_info(input->quantization_info());
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
+        gemm_input_to_use = &im2col_reshaped_info;
     }
     else if(append_bias)
     {
@@ -514,66 +452,45 @@
         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
     }
 
-    // Create GEMM output tensor
-    TensorShape shape_gemm(im2_col_info.tensor_shape());
-    shape_gemm.set(0, mat_weights_cols);
-    shape_gemm.set(1, mat_input_rows);
-    TensorInfo gemm_output_info = input->clone()->set_tensor_shape(shape_gemm);
-
-    // Reshape weights if needed
-    if(optimised_kernel)
+    // Create temporary GEMM output tensor in case we cannot skip col2im
+    if(!skip_col2im)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(are_weights_reshaped);
+        TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
+        shape_gemm.set(0, mat_weights_cols);
+        shape_gemm.set(1, conv_w * conv_h);
+        const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type;
+        // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
+        info_gemm = TensorInfo(shape_gemm, 1, gemm_data_type);
+        info_gemm.set_quantization_info(output->quantization_info());
 
-        // Create tensor to store the reshaped weights
-        reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
-    }
-    else if(!is_quantized)
-    {
-        TensorShape reshaped_weights_shape;
-
-        if(is_fully_connected_convolution || is_quantized)
-        {
-            reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
-        }
-        else
-        {
-            // Create tensor to store transposed weights
-            const float transpose_width = 16.0f / input->element_size();
-            reshaped_weights_shape      = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
-                                                       static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
-        }
-
-        // Create tensor to store the reshaped weights
-        reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
-        weights = reshaped_weights.get();
-
-        // Validate GEMM interleave and multiply
-        if(is_interleaved)
-        {
-            TensorShape shape_interleaved = shape_im2col;
-            shape_interleaved.set(idx_width, shape_interleaved.x() * 4);
-            shape_interleaved.set(idx_height, std::ceil(shape_interleaved.y() / 4.f));
-            TensorInfo input_interleaved_info = input->clone()->set_tensor_shape(shape_interleaved);
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(&im2_col_info, &input_interleaved_info));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&input_interleaved_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo(shape_im2col[1],            // m
-                                                                             weights->tensor_shape()[0], // n
-                                                                             shape_im2col[0]) /* k */));
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&im2_col_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
-        }
-    }
-    if(!is_nhwc)
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
+        gemm_output_to_use = &info_gemm;
     }
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(idx_width) != conv_w) || (output->dimension(idx_height) != conv_h), "Output shape does not match the expected one");
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, skip_col2im ? conv_h : 1, skip_im2col));
 
+    if(is_quantized)
+    {
+        float multiplier = input->quantization_info().scale * weights_to_use->quantization_info().scale / output->quantization_info().scale;
+        int   output_multiplier, output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+        tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
+        tmp_info.set_quantization_info(output->quantization_info());
+        gemm_output_staged_to_use = &tmp_info;
+
+        // Validate output stage for quantized case
+        NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, output->quantization_info().offset);
+    }
+
+    // Validate Col2Im/ReshapeLayer
+    if(!skip_col2im && (data_layout == DataLayout::NCHW))
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(is_quantized ? gemm_output_staged_to_use : gemm_output_to_use,
+                                                             output,
+                                                             Size2D(conv_w, conv_h)));
+    }
+
+    //Validate Activation Layer
     if(act_info.enabled())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
@@ -584,54 +501,30 @@
 
 void NEGEMMConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(!_are_weights_reshaped)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _are_weights_reshaped = true;
-        _reshape_weights.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
     if(!_skip_im2col)
     {
         // Run input reshaping
-        unsigned int _y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-        NEScheduler::get().schedule(&_input_im2col_kernel, _y_dim);
+        unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+        NEScheduler::get().schedule(&_im2col_kernel, y_dim);
     }
 
-    // Runs matrix multiply on reshaped matrices
-    if(_asm_glue._optimised_kernel != nullptr)
+    // Runs NEGEMM or NEGEMMLowpMatrixMultiplyCore functions
+    if(_is_quantized)
     {
-        _asm_glue.run();
-        // Release weights in case buffer is pretransposed
-        if(!_weights_reshaped.is_used())
-        {
-            _weights_reshaped.allocator()->free();
-        }
+        // Run gemmlowp
+        _mm_gemmlowp.run();
+
+        // Run output stage
+        _gemmlowp_output_stage.run();
     }
     else
     {
-        if(_is_interleaved)
-        {
-            // Run interleave
-            NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
-        }
-
-        // Runs matrix multiply on reshaped matrices
-        if(_is_quantized)
-        {
-            _mm_gemmlowp.run();
-        }
-        else
-        {
-            NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
-        }
+        // Run gemm
+        _mm_gemm.run();
     }
 
     if(_skip_im2col && _append_bias)
@@ -639,16 +532,17 @@
         NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY);
     }
 
-    // Run output stage for quantized case
-    if(_is_quantized)
-    {
-        _gemmlowp_output_stage.run();
-    }
-
     // Reshape output matrix
-    if(_data_layout == DataLayout::NCHW)
+    if(!_skip_col2im)
     {
-        NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+        if(_data_layout == DataLayout::NCHW)
+        {
+            NEScheduler::get().schedule(&_col2im_kernel, Window::DimY);
+        }
+        else
+        {
+            _reshape_layer.run();
+        }
     }
 
     if(_is_activationlayer_enabled)
@@ -658,4 +552,25 @@
 
     _memory_group.release();
 }
-} // namespace arm_compute
+
+void NEGEMMConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights reshaping and mark original weights tensor as unused
+        _weights_reshaped.allocator()->allocate();
+        _reshape_weights.run();
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM
+        _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
+        if(!_weights_reshaped.is_used())
+        {
+            _weights_reshaped.allocator()->free();
+        }
+
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 98b4767..47c3358 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -38,8 +38,7 @@
 using namespace arm_compute;
 
 NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(),
-      _workspace(), _B_pretransposed()
+    : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b()
 {
 }
 
@@ -53,18 +52,14 @@
     ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
 
     bool run_optimised = false;
-#ifdef __aarch64__
     switch(a->info()->data_type())
     {
         case DataType::S8:
-        {
-            run_optimised = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue_signed);
-            break;
-        }
         case DataType::QASYMM8:
         case DataType::U8:
         {
-            run_optimised = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue_unsigned);
+            _asm_glue.configure(a, b, output, 1.f, 0.f, true);
+            run_optimised = _asm_glue.is_configured();
             break;
         }
         default:
@@ -73,7 +68,6 @@
             break;
         }
     }
-#endif /* __aarch64__ */
     if(!run_optimised)
     {
         // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
@@ -133,13 +127,9 @@
         NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
     }
 
-    if(_asm_glue_unsigned._optimised_kernel != nullptr)
+    if(_asm_glue.is_configured())
     {
-        _asm_glue_unsigned.run();
-    }
-    else if(_asm_glue_signed._optimised_kernel != nullptr)
-    {
-        _asm_glue_signed.run();
+        _asm_glue.run();
     }
     else
     {
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 2e06fa2..828011d 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -41,9 +41,9 @@
 using namespace arm_compute::misc::shape_calculator;
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
-      _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _a_offset(0), _b_offset(0),
-      _run_vector_matrix_multiplication(false), _dot_product_path(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
+    : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
+      _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false),
+      _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
 
@@ -52,23 +52,27 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
     ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
 
+    // Clear state
+    _mtx_a_reshape_kernel = nullptr;
+    _mtx_b_reshape_kernel = nullptr;
+
+    // Set internal variables
     _a_offset                         = a->info()->quantization_info().offset;
     _b_offset                         = b->info()->quantization_info().offset;
     _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
     _reshape_b_only_on_first_run      = gemm_info.reshape_b_only_on_first_run();
+    _is_prepared                      = false;
+    _original_b                       = b;
 
 #ifdef __aarch64__
     switch(a->info()->data_type())
     {
-        case DataType::S8:
-        {
-            _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed);
-            break;
-        }
         case DataType::QASYMM8:
         case DataType::U8:
+        case DataType::S8:
         {
-            _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned);
+            _asm_glue.configure(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run);
+            _dot_product_path = _asm_glue.is_configured();
             break;
         }
         default:
@@ -160,10 +164,13 @@
     if(!_dot_product_path && !_run_vector_matrix_multiplication)
     {
         _tmp_a.allocator()->allocate();
-        _tmp_b.allocator()->allocate();
+        if(!_reshape_b_only_on_first_run)
+        {
+            _tmp_b.allocator()->allocate();
+        }
     }
 
-    if(_a_offset != 0)
+    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
     {
         _vector_sum_col.allocator()->allocate();
     }
@@ -188,6 +195,8 @@
     ARM_COMPUTE_UNUSED(gemm_info);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the input tensor as 3D");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 1, "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the output tensor as 3D");
 
     int32_t a_offset                         = a->quantization_info().offset;
     int32_t b_offset                         = b->quantization_info().offset;
@@ -248,29 +257,24 @@
 
 void NEGEMMLowpMatrixMultiplyCore::run()
 {
+    prepare();
+
     _memory_group.acquire();
 
-    // Do not reshape if we run the vector-by-matrix case and we do not have the optimized gemm with dot product instruction
-    if(!_run_vector_matrix_multiplication && !_dot_product_path)
+    // Reshape inputs
+    if(_mtx_a_reshape_kernel)
     {
-        if(_mtx_a_reshape_kernel)
-        {
-            NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
-        }
-
-        if(_mtx_b_reshape_kernel && (_is_first_run || !_reshape_b_only_on_first_run))
-        {
-            NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
-        }
+        NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+    }
+    if(_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)
+    {
+        NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
     }
 
-    if(_asm_glue_unsigned._optimised_kernel != nullptr)
+    // Run GEMM
+    if(_asm_glue.is_configured())
     {
-        _asm_glue_unsigned.run();
-    }
-    else if(_asm_glue_signed._optimised_kernel != nullptr)
-    {
-        _asm_glue_signed.run();
+        _asm_glue.run();
     }
     else
     {
@@ -284,7 +288,7 @@
     }
 
     // Run matrix B reduction kernel only if _a_offset is not equal to 0
-    if(_a_offset != 0 && (_is_first_run || !_reshape_b_only_on_first_run))
+    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
     {
         NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
     }
@@ -293,6 +297,38 @@
     NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
 
     _memory_group.release();
+}
 
-    _is_first_run = false;
+void NEGEMMLowpMatrixMultiplyCore::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run assembly reshape
+        if(_asm_glue.is_configured() && _reshape_b_only_on_first_run)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+            _asm_glue.prepare();
+            _original_b->mark_as_unused();
+        }
+        // Run non-assembly reshape
+        else if(_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+            // Run reshape kernel and mark original weights tensor as unused
+            _tmp_b.allocator()->allocate();
+            NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+            _original_b->mark_as_unused();
+        }
+
+        // Run matrix B reduction kernel only if _a_offset is not equal to 0
+        if(_a_offset != 0 && _reshape_b_only_on_first_run)
+        {
+            _vector_sum_col.allocator()->allocate();
+            NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+        }
+
+        _is_prepared = true;
+    }
 }
diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index 6b95cb0..4245b65 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -34,16 +34,18 @@
 {
 }
 
-void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups,
+                         bool is_fully_connected, bool is_flatten)
 {
     _y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
 
-    _kernel.configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+    _kernel.configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten);
 }
 
-Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
+Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
+                          unsigned int num_groups, bool is_fully_connected, bool is_flatten)
 {
-    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten);
 }
 
 void NEIm2Col::run()
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 913acf8..80a2541 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -73,7 +73,7 @@
 
 NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
-      _is_first_run(false), _original_weights(nullptr)
+      _is_prepared(false), _original_weights(nullptr)
 {
 }
 
@@ -113,7 +113,7 @@
     TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type());
     TensorInfo gemm_output_info(shape_gemm, 1, input->data_type());
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias, false));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
     ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
     ARM_COMPUTE_RETURN_ON_ERROR(NELocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
     ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
@@ -127,7 +127,7 @@
     ARM_COMPUTE_ERROR_THROW_ON(NELocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
 
     bool _has_bias    = (biases != nullptr);
-    _is_first_run     = true;
+    _is_prepared      = false;
     _original_weights = weights;
 
     const unsigned int kernel_width  = weights->info()->dimension(0);
@@ -160,24 +160,13 @@
     _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
 
     // Allocate intermediate tensors
-    _weights_reshaped.allocator()->allocate();
     _input_im2col_reshaped.allocator()->allocate();
     _gemm_output.allocator()->allocate();
 }
 
 void NELocallyConnectedLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _is_first_run = false;
-        NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -192,3 +181,18 @@
 
     _memory_group.release();
 }
+
+void NELocallyConnectedLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights reshaping and mark original weights tensor as unused
+        _weights_reshaped.allocator()->allocate();
+        NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+        _original_weights->mark_as_unused();
+
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp
index f865054..2738201 100644
--- a/src/runtime/NEON/functions/NEMagnitude.cpp
+++ b/src/runtime/NEON/functions/NEMagnitude.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,36 +31,18 @@
 
 using namespace arm_compute;
 
-void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, MagnitudeType mag_type, bool use_fp16)
+void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, MagnitudeType mag_type)
 {
-    if(use_fp16)
+    if(mag_type == MagnitudeType::L1NORM)
     {
-        if(mag_type == MagnitudeType::L1NORM)
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::SIGNED>>();
-            k->configure(input1, input2, output, nullptr);
-            _kernel = std::move(k);
-        }
-        else
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
-            k->configure(input1, input2, output, nullptr);
-            _kernel = std::move(k);
-        }
+        auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>>();
+        k->configure(input1, input2, output, nullptr);
+        _kernel = std::move(k);
     }
     else
     {
-        if(mag_type == MagnitudeType::L1NORM)
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>>();
-            k->configure(input1, input2, output, nullptr);
-            _kernel = std::move(k);
-        }
-        else
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
-            k->configure(input1, input2, output, nullptr);
-            _kernel = std::move(k);
-        }
+        auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+        k->configure(input1, input2, output, nullptr);
+        _kernel = std::move(k);
     }
 }
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index af98ac1..f00114f 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,7 +41,7 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type());
     _input_squared.allocator()->init(tensor_info);
 
     // Manage intermediate buffers
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
new file mode 100644
index 0000000..995d5ee
--- /dev/null
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NERNNLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
+      _is_prepared(false)
+{
+}
+
+Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
+                            const ITensorInfo *output, const ActivationLayerInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+
+    const int idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const int idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != recurrent_weights->dimension(idx_width));
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != recurrent_weights->dimension(idx_height));
+    ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+    ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+    ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
+
+    auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info));
+
+    return Status{};
+}
+
+void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output,
+                           ActivationLayerInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+
+    const int   idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    TensorShape shape      = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+
+    _is_prepared = false;
+
+    // Manage intermediate buffers and configure
+    _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+    _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+
+    // Manage intermediate buffers and configure
+    _memory_group.manage(&_fully_connected_out);
+    _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+
+    _memory_group.manage(&_gemm_output);
+    _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+
+    _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+    _memory_group.manage(&_add_output);
+
+    _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+
+    _fully_connected_out.allocator()->allocate();
+    _gemm_output.allocator()->allocate();
+
+    _activation_kernel.configure(&_add_output, hidden_state, info);
+    _add_output.allocator()->allocate();
+
+    _copy_kernel.configure(hidden_state, output);
+}
+
+void NERNNLayer::run()
+{
+    prepare();
+
+    _memory_group.acquire();
+
+    _fully_connected_kernel.run();
+
+    _gemm_state_f.run();
+
+    NEScheduler::get().schedule(&_add_kernel, Window::DimY);
+    NEScheduler::get().schedule(&_activation_kernel, Window::DimY);
+
+    // copy hidden out to output
+    NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+
+    _memory_group.release();
+}
+
+void NERNNLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        _fully_connected_kernel.prepare();
+        _gemm_state_f.prepare();
+
+        _is_prepared = true;
+    }
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
new file mode 100644
index 0000000..a4b0dff
--- /dev/null
+++ b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NESimpleAssemblyFunction::NESimpleAssemblyFunction() // NOLINT
+    : _kernel()
+{
+}
+
+void NESimpleAssemblyFunction::run()
+{
+    NEScheduler::get().schedule(_kernel.get(), Window::DimX);
+}
+
+void NESimpleAssemblyFunction::configure(std::unique_ptr<INEGEMMWrapperKernel> kernel)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(kernel.get());
+    _kernel = std::move(kernel);
+    ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(_kernel->window(), 1);
+}
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 4fb8300..3a73f1e 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -62,6 +62,7 @@
 {
     // Perform validation step
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported");
 
     const TensorShape max_shape           = TensorShape(input->tensor_shape()).set(0, 1);
     const TensorInfo  tensor_info_max_sum = TensorInfo(*input).set_tensor_shape(max_shape).reset_padding();
diff --git a/src/runtime/NEON/functions/NEWarpAffine.cpp b/src/runtime/NEON/functions/NEWarpAffine.cpp
index 889d827..105646c 100644
--- a/src/runtime/NEON/functions/NEWarpAffine.cpp
+++ b/src/runtime/NEON/functions/NEWarpAffine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,11 +32,10 @@
 
 using namespace arm_compute;
 
-void NEWarpAffine::configure(ITensor *input, ITensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void NEWarpAffine::configure(ITensor *input, ITensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(nullptr == matrix);
 
     switch(policy)
     {
diff --git a/src/runtime/NEON/functions/NEWarpPerspective.cpp b/src/runtime/NEON/functions/NEWarpPerspective.cpp
index ed5d6a0..80b97ce 100644
--- a/src/runtime/NEON/functions/NEWarpPerspective.cpp
+++ b/src/runtime/NEON/functions/NEWarpPerspective.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,11 +32,10 @@
 
 using namespace arm_compute;
 
-void NEWarpPerspective::configure(ITensor *input, ITensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void NEWarpPerspective::configure(ITensor *input, ITensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(nullptr == matrix);
 
     switch(policy)
     {
diff --git a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
new file mode 100644
index 0000000..097605c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+NEWidthConcatenateLayer::NEWidthConcatenateLayer()
+    : _concat_kernels_vector(),
+      _num_inputs(0)
+{
+}
+
+Status NEWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
+
+    // Output auto inizialitation if not yet initialized
+    TensorInfo  tmp_output_info = *output->clone();
+    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
+
+    unsigned int width_offset = 0;
+    for(const auto &input : inputs_vector)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+        ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
+        width_offset += input->dimension(0);
+    }
+
+    return Status{};
+}
+
+void NEWidthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+{
+    _num_inputs = inputs_vector.size();
+
+    std::vector<ITensorInfo *> inputs_vector_info;
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+    }
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+    ARM_COMPUTE_ERROR_THROW_ON(NEWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
+
+    unsigned int width_offset = 0;
+
+    _concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEWidthConcatenateLayerKernel[]>(_num_inputs);
+
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
+        width_offset += inputs_vector.at(i)->info()->dimension(0);
+    }
+}
+
+void NEWidthConcatenateLayer::run()
+{
+    for(unsigned i = 0; i < _num_inputs; i++)
+    {
+        NEScheduler::get().schedule(_concat_kernels_vector.get() + i, Window::DimY);
+    }
+}
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index 8f2c4c4..828a593 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -24,16 +24,15 @@
 #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
 
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/AssemblyHelper.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "support/ToolchainSupport.h"
 
-#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
-
 #include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
 
 namespace arm_compute
@@ -60,7 +59,6 @@
     ARM_COMPUTE_UNUSED(output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); // COMPMID-1162
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 3 && weights->dimension(height_idx) != 5, "Only 3 and 5 kernels are supported");
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
 
@@ -107,12 +105,13 @@
 
     return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
 }
+
 } //namespace
 
 NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr),
-      _activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(),
-      _workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false)
+    : _memory_group(memory_manager), _asm_glue(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
+      _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(),
+      _is_prepared(false), _is_activationlayer_enabled(false)
 {
 } /* arm_compute */
 
@@ -138,9 +137,10 @@
         ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
     }
 
-    _weights = weights;
-    _input   = input;
-    _output  = output;
+    _weights     = weights;
+    _input       = input;
+    _output      = output;
+    _is_prepared = false;
 
     std::unique_ptr<INEWinogradLayerTransformInputKernel<float>>   transform_input_kernel;
     std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
@@ -155,29 +155,32 @@
         {
             if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
             {
-                transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>>();
-                transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>>();
-                transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>>();
-                n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradBase::N_GEMMS;
-                N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradConv::N_BLOCK;
+                using config             = NEWinogradLayerConfiguration<float, float, 4, 4, 3, 3>;
+                transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
+                transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+                transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
+                n_gemms                  = config::WinogradBase::N_GEMMS;
+                N_BLOCK                  = config::WinogradConv::N_BLOCK;
             }
             else
             {
-                transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
-                transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
-                transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
-                n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradBase::N_GEMMS;
-                N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradConv::N_BLOCK;
+                using config             = NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>;
+                transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
+                transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+                transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
+                n_gemms                  = config::WinogradBase::N_GEMMS;
+                N_BLOCK                  = config::WinogradConv::N_BLOCK;
             }
             break;
         }
         case 5:
         {
-            transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
-            transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
-            transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
-            n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradBase::N_GEMMS;
-            N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradConv::N_BLOCK;
+            using config             = NEWinogradLayerConfiguration<float, float, 2, 2, 5, 5>;
+            transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
+            transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+            transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
+            n_gemms                  = config::WinogradBase::N_GEMMS;
+            N_BLOCK                  = config::WinogradConv::N_BLOCK;
             break;
         }
         default:
@@ -195,96 +198,138 @@
     const int out_channels = output->info()->dimension(channel_idx);
 
     const Tensor4DShape in_shape(internal_get_input_shape(input));
+    const DataType      data_type      = input->info()->data_type();
     const size_t        data_type_size = input->info()->element_size();
     // Get the memory required to instantiate a new Winograd operator.
-    constexpr size_t storage_alignment   = 64;
-    const size_t     kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
-    _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
-    _kernel_storage.allocator()->allocate();
+    constexpr size_t storage_alignment = 64;
+
+    // Kernel Storage
+    const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels,
+                                                                                         in_channels)
+                                       * data_type_size
+                                       + storage_alignment - 1;
+
     // Input storage
-    const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
-    _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
-    _input_workspace.allocator()->allocate();
+    const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols,
+                                                                                     use_same_padding)
+                                      * data_type_size
+                                      + storage_alignment - 1;
 
     // Output storage
-    const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size;
-    _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
-    _output_workspace.allocator()->allocate();
+    const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels,
+                                                                                        use_same_padding)
+                                       * data_type_size
+                                       + storage_alignment - 1;
+    ;
+    const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
+    const int         kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
+
+    const int  output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+    const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
+
+    const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+
+    // Configure GEMM
+    const int tile_rows                = iceildiv(output_shape.n_rows, output_tile.height);
+    const int tile_cols                = iceildiv(output_shape.n_cols, output_tile.width);
+    const int m                        = in_shape.n_batches * tile_rows * tile_cols;
+    const int k                        = in_shape.n_channels;
+    const int n                        = out_channels;
+    const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
+    const int output_matrix_row_stride = kernel_matrix_row_stride;
+
+    TensorShape a_shape(k, m, 1, n_gemms);
+    Strides     a_strides(data_type_size);
+    a_strides.set(1, a_strides[0] * k);
+    a_strides.set(2, 0);
+    a_strides.set(3, data_type_size * input_matrix_stride);
+
+    TensorShape b_shape(n, k, n_gemms);
+    Strides     b_strides(data_type_size);
+    b_strides.set(1, data_type_size * kernel_matrix_row_stride);
+    b_strides.set(2, data_type_size * kernel_matrix_stride);
+
+    TensorShape d_shape(n, m, 1, n_gemms);
+    Strides     d_strides(data_type_size);
+    d_strides.set(1, data_type_size * output_matrix_row_stride);
+    d_strides.set(2, 0);
+    d_strides.set(3, data_type_size * output_matrix_stride);
+
+    TensorInfo a_info, b_info, d_info;
+    a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size);
+    b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size);
+    d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size);
+
+    _input_workspace.allocator()->init(a_info, storage_alignment);
+    _kernel_storage.allocator()->init(b_info, storage_alignment);
+    _output_workspace.allocator()->init(d_info, storage_alignment);
 
     // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
     TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
                                 _output->info()->dimension(1), _output->info()->dimension(3)),
                     1, _output->info()->data_type());
     _output_nhwc.allocator()->init(info);
-    _output_nhwc.allocator()->allocate();
-
-    // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
-    _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
-    _weights_hwio.allocator()->allocate();
-
-    // configure the kernel to transform the input tensor from NCHW -> NHWC
-    _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
-    _input_nhwc.allocator()->allocate();
-
-    const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
 
     // Configure the InputTransform
-    const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
-    transform_input_kernel->configure(reinterpret_cast<float *>(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
-                                      reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_stride);
+    _memory_group.manage(&_input_workspace);
+    if(data_layout == DataLayout::NCHW)
+    {
+        // configure the kernel to transform the input tensor from NCHW -> NHWC
+        _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+        _input_nhwc.allocator()->allocate();
+        transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+                                          &_input_workspace, input_matrix_stride);
+    }
+    else
+    {
+        transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+                                          &_input_workspace, input_matrix_stride);
+    }
 
     // Configure WeightsTransform
-    const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
-    transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast<float *>(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels);
+    if(data_layout == DataLayout::NCHW)
+    {
+        // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
+
+        transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
+    }
+    else
+    {
+        // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U));
+
+        transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
+    }
+    _weights_hwio.allocator()->allocate();
 
     // Configure OutputTransform
     //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
-    const int  output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
-    const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
 
-    transform_output_kernel->configure(biases, reinterpret_cast<float *>(_output_workspace.buffer()),
-                                       output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
-                                       in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
-
-    // Configure GEMM
-    const int    tile_rows                = iceildiv(output_shape.n_rows, output_tile.height);
-    const int    tile_cols                = iceildiv(output_shape.n_cols, output_tile.width);
-    const int    m                        = in_shape.n_batches * tile_rows * tile_cols;
-    const int    k                        = in_shape.n_channels;
-    const int    n                        = out_channels;
-    const int    input_matrix_row_stride  = in_shape.n_channels;
-    const int    kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
-    const int    output_matrix_row_stride = kernel_matrix_row_stride;
-    unsigned int num_threads              = NEScheduler::get().num_threads();
-
-    _arm_gemm = arm_gemm::gemm<float, float>(NEScheduler::get().cpu_info(), m, n, k, 1, n_gemms, false, false, 1.f, 0.f, num_threads, false);
-    _arm_gemm->set_arrays(reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_row_stride, 0, input_matrix_stride, reinterpret_cast<float *>(_kernel_storage.buffer()),
-                          kernel_matrix_row_stride, kernel_matrix_stride, reinterpret_cast<float *>(_output_workspace.buffer()), output_matrix_row_stride, 0, output_matrix_stride);
-
-    auto acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapper<arm_gemm::GemmCommon<float, float>>>();
-    acl_gemm_wrapper->configure(_arm_gemm.get());
-    const size_t workspace_size = _arm_gemm->get_working_size();
-
-    // Allocate workspace
-    if(workspace_size > 0)
+    _memory_group.manage(&_output_workspace);
+    if(data_layout == DataLayout::NCHW)
     {
-        const unsigned int alignment = 4096;
-        allocate_workspace(workspace_size, _workspace, &_memory_group, alignment, 1);
-        _arm_gemm->set_working_space(reinterpret_cast<float *>(_workspace.buffer()));
+        transform_output_kernel->configure(biases, &_output_workspace,
+                                           output_matrix_stride, &_output_nhwc,
+                                           in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
+    }
+    else
+    {
+        transform_output_kernel->configure(biases, &_output_workspace,
+                                           output_matrix_stride, _output,
+                                           in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
     }
 
-    const unsigned int window_size = _arm_gemm->get_window_size();
-    if(window_size < num_threads)
-    {
-        num_threads = window_size;
-        _arm_gemm->set_nthreads(num_threads);
-    }
-
-    _gemm_kernel = std::move(acl_gemm_wrapper);
+    _asm_glue.configure(&_input_workspace, &_kernel_storage, &_output_workspace, 1.0f, 0.f, false);
+    _input_workspace.allocator()->allocate();
+    _kernel_storage.allocator()->allocate();
+    _output_workspace.allocator()->allocate();
 
     // Reorder the convoluted output to ACL's ordering NCHW
     _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
 
+    _output_nhwc.allocator()->allocate();
+
     _transform_input_kernel   = std::move(transform_input_kernel);
     _transform_weights_kernel = std::move(transform_weights_kernel);
     _transform_output_kernel  = std::move(transform_output_kernel);
@@ -293,38 +338,43 @@
     _is_activationlayer_enabled = act_info.enabled();
     if(_is_activationlayer_enabled)
     {
-        _activationlayer_function.configure(output, nullptr, act_info);
+        _activationlayer_function.configure(_output, nullptr, act_info);
     }
 }
 
 void NEWinogradConvolutionLayer::run()
 {
-    _memory_group.acquire();
-    if(!_reshaped_kernel)
-    {
-        _reshaped_kernel = true;
-        _permute_weights.run();
-        NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
-    }
-    //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
-    _permute_input.run();
+    const DataLayout data_layout = _input->info()->data_layout();
 
+    prepare();
+
+    _memory_group.acquire();
+
+    if(data_layout == DataLayout::NCHW)
+    {
+        //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
+        _permute_input.run();
+    }
     // Transform input tensor to the winograd domain
     NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
 
     //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
-    NEScheduler::get().schedule(_gemm_kernel.get(), Window::DimX);
+    _asm_glue.run();
 
     // Transform output tensor to the spatial domain
     NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
 
-    // Reorder the convoluted output to ACL's ordering NCHW
-    _permute_output.run();
+    if(data_layout == DataLayout::NCHW)
+    {
+        // Reorder the convoluted output to ACL's ordering NCHW
+        _permute_output.run();
+    }
 
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.run();
     }
+
     _memory_group.release();
 }
 
@@ -358,6 +408,7 @@
     // Validate input transform
     const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
     const TensorInfo  input0       = input->clone()->set_tensor_shape(input0_shape);
+
     switch(weights->dimension(idx_width))
     {
         case 3:
@@ -444,7 +495,6 @@
             break;
         }
     }
-
     // Validate Activation Layer
     if(act_info.enabled())
     {
@@ -453,4 +503,20 @@
     return Status{};
 }
 
+void NEWinogradConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Permute weights
+        _permute_weights.run();
+        _weights->mark_as_unused();
+
+        // Transform weights
+        NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
+        _weights_hwio.allocator()->free();
+
+        _is_prepared = true;
+    }
+}
+
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
new file mode 100644
index 0000000..b52ce66
--- /dev/null
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager))
+{
+}
+void NEGEMMInterleavedWrapper::run()
+{
+    prepare();
+
+    _memory_group.acquire();
+    NEScheduler::get().run_workloads(_workloads);
+    _memory_group.release();
+}
+
+void NEGEMMInterleavedWrapper::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_pretranspose_b)
+        {
+            NEScheduler::get().schedule(_prepare_b.get(), Window::DimX);
+            _b->mark_as_unused();
+        }
+        else
+        {
+            _prepare_b->create_workloads(_b_workloads);
+        }
+        _transform_a->create_workloads(_a_workloads);
+        _matrix_multiply->create_workloads(_mm_workloads);
+
+        //Maximum number of workloads to create:
+        const unsigned int num_threads    = NEScheduler::get().num_threads();
+        const unsigned int max_iterations = num_threads == 1 ? 1 : num_threads;
+        //Maximum number of iterations the parameters allow:
+        const unsigned int num_iterations = _batch_window.num_iterations_total();
+        // Keep the smallest of the two:
+        const unsigned int num_windows  = std::min(num_iterations, max_iterations);
+        const TensorShape  window_shape = _batch_window.shape();
+
+        // Create a 1D window to dynamically split the batch window:
+        Window win_1D;
+        win_1D.set(0, Window::Dimension(0, num_iterations));
+
+        // Create one workload for each sub-window:
+        for(unsigned int w = 0; w < num_windows; w++)
+        {
+            Window             win          = win_1D.split_window(0, w, num_windows);
+            const Coordinates  start_offset = index2coords(window_shape, win.x().start());
+            const Coordinates  end_offset   = index2coords(window_shape, win.x().end() - 1);
+            const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
+
+            auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+            {
+                //For each block of rows in "M"
+                auto workload_mm = this->_mm_workloads.begin();
+                for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+                {
+                    // Transform one k_block from A:
+                    this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+                    // Then perform the matrix multiplication for each x block along N:
+                    for(unsigned int i = 0; i < num_x_blocks; i++)
+                    {
+                        ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
+                        this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+                    }
+                }
+            };
+            _workloads.push_back(workload);
+        }
+
+        _is_prepared = true;
+    }
+}
+
+namespace
+{
+// Factory to instantiate NEGEMMInterleavedPrepareBWrapperKernel:
+template <typename InputType, bool use_dot = false>
+std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor *b, ITensor *transformed_b, const INEGEMMWrapperKernel::Params &params)
+{
+    auto prepare_b = support::cpp14::make_unique<NEGEMMInterleavedPrepareBWrapperKernelTemplate<InputType, use_dot>>();
+    prepare_b->configure(b, transformed_b, false, NEScheduler::get().cpu_info(), params);
+    return std::move(prepare_b);
+}
+
+// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate:
+template <typename InputType, bool use_dot = false>
+std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor *a, ITensor *transformed_a, const Window &block_walker, const INEGEMMWrapperKernel::Params &params)
+{
+    auto transform_a = support::cpp14::make_unique<NEGEMMInterleavedTransformAWrapperTemplate<InputType, use_dot>>();
+    transform_a->configure(a, transformed_a, false, block_walker, params);
+    return std::move(transform_a);
+}
+
+// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate:
+template <typename InputType, typename OutputType, bool use_dot = false>
+std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker,
+                                                                                    const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params &params, bool pretranspose_b, float alpha, float beta)
+{
+    auto matrix_multiply = support::cpp14::make_unique<NEGEMMInterleavedMatrixMultiplyWrapperTemplate<InputType, OutputType, use_dot>>();
+    matrix_multiply->configure(transformed_a, transformed_b, tmp_c, c, block_walker, block_sizes, params, pretranspose_b, alpha, beta, NEScheduler::get().num_threads());
+    return std::move(matrix_multiply);
+}
+} // namespace
+
+void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b, bool use_dot)
+{
+    _params         = INEGEMMWrapperKernel::extract_parameters(a, b, c);
+    _a              = a;
+    _b              = b;
+    _c              = c;
+    _pretranspose_b = pretranspose_b;
+
+    DataType input_type = a->info()->data_type();
+
+    // Forcing 128-byte alignment (required by 32-bit kernels)
+    const unsigned int alignment = 128;
+    _transformed_b.allocator()->init(TensorInfo{}, alignment);
+    _tmp_c.allocator()->init(TensorInfo{}, alignment);
+    if(!_pretranspose_b)
+    {
+        // If B is transposed at every iteration then transformed_B can be managed:
+        _memory_group.manage(&_transformed_b);
+    }
+    switch(input_type)
+    {
+        case DataType::F32:
+            _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
+            break;
+#ifdef __aarch64__
+        case DataType::U8:
+        case DataType::QASYMM8:
+            if(use_dot)
+            {
+                _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
+            }
+            else
+            {
+                _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
+            }
+            break;
+        case DataType::S8:
+            if(use_dot)
+            {
+                _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
+            }
+            else
+            {
+                _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
+            }
+            break;
+#endif /* __aarch64__ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            ARM_COMPUTE_ERROR("DataType not supported");
+            break;
+    }
+    ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+
+    _block_sizes = _prepare_b->block_sizes();
+
+    _block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block));
+    _block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block));
+    _block_walker.set(Window::DimZ, Window::Dimension(0, _params.multis));
+
+    _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+    _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
+
+    _transformed_a.allocator()->init(TensorInfo(TensorShape{ _block_sizes.k_block, _block_sizes.m_round, _params.batches }, 1, input_type), alignment);
+    _memory_group.manage(&_transformed_a);
+    _memory_group.manage(&_tmp_c);
+
+    switch(input_type)
+    {
+        case DataType::F32:
+            _transform_a     = instantiate_transformA<float>(_a, &_transformed_a, _block_walker, _params);
+            _matrix_multiply = instantiate_matrix_multiply<float, float>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            break;
+#ifdef __aarch64__
+        case DataType::U8:
+        case DataType::QASYMM8:
+            if(use_dot)
+            {
+                _transform_a     = instantiate_transformA<uint8_t, true>(_a, &_transformed_a, _block_walker, _params);
+                _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            }
+            else
+            {
+                _transform_a     = instantiate_transformA<uint8_t, false>(_a, &_transformed_a, _block_walker, _params);
+                _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            }
+            break;
+        case DataType::S8:
+            if(use_dot)
+            {
+                _transform_a     = instantiate_transformA<int8_t, true>(_a, &_transformed_a, _block_walker, _params);
+                _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            }
+            else
+            {
+                _transform_a     = instantiate_transformA<int8_t, false>(_a, &_transformed_a, _block_walker, _params);
+                _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            }
+            break;
+#endif /* __aarch64__ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _transform_a     = instantiate_transformA<__fp16>(_a, &_transformed_a, _block_walker, _params);
+            _matrix_multiply = instantiate_matrix_multiply<__fp16, __fp16>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            break;
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            break;
+    }
+    ARM_COMPUTE_ERROR_ON(_transform_a == nullptr);
+    ARM_COMPUTE_ERROR_ON(_matrix_multiply == nullptr);
+    _transformed_a.allocator()->allocate();
+    _tmp_c.allocator()->allocate();
+    _transformed_b.allocator()->allocate();
+}
+} // namespace arm_compute
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index 795c96c..f4253c8 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -56,29 +56,55 @@
     _num_threads                 = (num_threads == 0) ? num_cores : num_threads;
 }
 
-void OMPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
+void OMPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
-
-    ThreadInfo info;
-    info.cpu_info = &_cpu_info;
+    ARM_COMPUTE_ERROR_ON_MSG(hints.strategy() == StrategyHint::DYNAMIC,
+                             "Dynamic scheduling is not supported in OMPScheduler");
 
     const Window      &max_window     = kernel->window();
-    const unsigned int num_iterations = max_window.num_iterations(split_dimension);
-    info.num_threads                  = std::min(num_iterations, _num_threads);
+    const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
+    const unsigned int num_threads    = std::min(num_iterations, _num_threads);
 
-    if(!kernel->is_parallelisable() || info.num_threads == 1)
+    if(!kernel->is_parallelisable() || num_threads == 1)
     {
+        ThreadInfo info;
+        info.cpu_info = &_cpu_info;
         kernel->run(max_window, info);
     }
     else
     {
-        #pragma omp parallel firstprivate(info) num_threads(info.num_threads)
+        const unsigned int                num_windows = num_threads;
+        std::vector<IScheduler::Workload> workloads(num_windows);
+        for(unsigned int t = 0; t < num_windows; t++)
         {
-            const int tid  = omp_get_thread_num();
-            Window    win  = max_window.split_window(split_dimension, tid, info.num_threads);
-            info.thread_id = tid;
-            kernel->run(win, info);
+            //Capture 't' by copy, all the other variables by reference:
+            workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info)
+            {
+                Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
+                win.validate();
+                kernel->run(win, info);
+            };
         }
+        run_workloads(workloads);
+    }
+}
+
+void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload> &workloads)
+{
+    const unsigned int num_threads = std::min(_num_threads, static_cast<unsigned int>(workloads.size()));
+    if(num_threads < 1)
+    {
+        return;
+    }
+
+    ThreadInfo info;
+    info.cpu_info    = &_cpu_info;
+    info.num_threads = num_threads;
+    #pragma omp parallel firstprivate(info) num_threads(num_threads)
+    {
+        const int tid  = omp_get_thread_num();
+        info.thread_id = tid;
+        workloads[tid](info);
     }
 }
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 993a95b..c84a271 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -138,7 +138,7 @@
 
     if(_associated_memory_group == nullptr)
     {
-        _memory = Memory(std::make_shared<MemoryRegion>(info().total_size()));
+        _memory = Memory(std::make_shared<MemoryRegion>(info().total_size(), alignment()));
     }
     else
     {