arm_compute v19.11
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 4aeb3a1..00dbb71 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,13 +25,21 @@
 
 #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx)
+    : ICLSimpleFunction(ctx)
+{
+}
 
 void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>();
+    auto core_ctx = _ctx ? _ctx->core_runtime_context() : /* Legacy */ nullptr;
+
+    auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>(core_ctx);
     k->configure(input, output, act_info);
     _kernel = std::move(k);
 }
@@ -40,3 +48,4 @@
 {
     return CLActivationLayerKernel::validate(input, output, act_info);
 }
+} // namespace arm_compute
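
Note on the change above: CLActivationLayer now takes an optional CLRuntimeContext so its kernel can be created against a caller-owned context, while a null context keeps the legacy CLScheduler path. Below is a minimal usage sketch of the new constructor, not part of the diff; the default_init() call, tensor shapes and data type are illustrative assumptions.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"

    using namespace arm_compute;

    void run_relu_example()
    {
        CLScheduler::get().default_init(); // legacy global scheduler setup

        CLTensor src;
        CLTensor dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

        // nullptr selects the legacy path; a CLRuntimeContext* can be passed
        // instead so the kernel is built on a caller-owned context.
        CLActivationLayer act(nullptr);
        act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

        src.allocator()->allocate();
        dst.allocator()->allocate();
        act.run();
    }
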
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index a6393c5..fd172d5 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,26 +23,33 @@
  */
 
 #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
+#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
 
-#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
 
 namespace arm_compute
 {
-void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _reduction_function(support::cpp14::make_unique<CLReductionOperation>(std::move(memory_manager)))
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel>();
-    k->configure(input, output, axis, op);
-    _kernel = std::move(k);
+}
+
+void CLArgMinMaxLayer::configure(ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+{
+    _reduction_function->configure(input, output, axis, op, false);
 }
 
 Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
-    return CLReductionOperationKernel::validate(input, output, axis, op);
+    return CLReductionOperation::validate(input, output, axis, op, false);
+}
+
+void CLArgMinMaxLayer::run()
+{
+    _reduction_function->run();
 }
 } // namespace arm_compute
\ No newline at end of file
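
Note on the change above: CLArgMinMaxLayer is now a thin wrapper over CLReductionOperation (configured with keep_dims = false) instead of driving CLReductionOperationKernel directly, and it gains a memory-manager constructor and its own run(). A minimal usage sketch, not part of the diff; the U32 index type and the collapsed output shape are assumptions.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"

    using namespace arm_compute;

    void run_argmax_example()
    {
        CLScheduler::get().default_init();

        CLTensor input;
        CLTensor indices;
        input.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));
        // Axis 0 is reduced away (the wrapper passes keep_dims = false);
        // U32 indices are an assumption here.
        indices.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::U32));

        CLArgMinMaxLayer argmax(nullptr); // memory manager is optional
        argmax.configure(&input, /*axis=*/0, &indices, ReductionOperation::ARG_IDX_MAX);

        input.allocator()->allocate();
        indices.allocator()->allocate();
        argmax.run();
    }
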
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 4c7458d..dbaea81 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -128,7 +128,7 @@
     }
     else
     {
-        ARM_COMPUTE_ERROR("Gradient size %d not supported", gradient_size);
+        ARM_COMPUTE_ERROR_VAR("Gradient size %d not supported", gradient_size);
     }
 
     // Manage intermediate buffers
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
index b22809e..5e1278d 100644
--- a/src/runtime/CL/functions/CLCropResize.cpp
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -48,7 +48,7 @@
                         std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
     end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
                       std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
-    const TensorShape out_shape(input->info()->tensor_shape()[0], abs(end[0] - start[0]) + 1, abs(end[1] - start[1]) + 1);
+    const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
     output->info()->set_tensor_shape(out_shape);
 }
 
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index 63a45aa..eaf7c66 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index f01b58a..e717f79 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -38,41 +38,386 @@
 using namespace arm_compute::misc;
 using namespace arm_compute::misc::shape_calculator;
 
+namespace
+{
+Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                              unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
+{
+    // This function should be removed and incorporated inside CLDepthwiseConvolutionLayerInternal3x3 once CLDepthwiseConvolutionLayer3x3 is properly removed
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
+    const bool                      is_quantized           = is_data_type_quantized_asymmetric(input->data_type());
+    const bool                      is_nhwc                = input->data_layout() == DataLayout::NHWC;
+    const bool                      needs_permute          = is_nhwc && (depth_multiplier > 1);
+    const bool                      needs_weights_reshape  = is_nhwc && (depth_multiplier == 1) && is_quantized;
+    const bool                      is_stride_1            = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool                      is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+    const bool                      is_dot8_supported      = dot8_supported(CLKernelLibrary::get().get_device());
+    DepthwiseConvolutionReshapeInfo info;
+    info.c0        = 4;
+    info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
+
+    TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32));
+    if(is_quantized)
+    {
+        if(is_data_type_quantized_per_channel(weights->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
+
+            const size_t idx_c = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
+            output_multipliers_shifts_info.set_tensor_shape(TensorShape(weights->dimension(idx_c)));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+        }
+    }
+
+    if(needs_permute)
+    {
+        TensorShape permuted_input_shape   = input->tensor_shape();
+        TensorShape permuted_weights_shape = weights->tensor_shape();
+        TensorShape permuted_output_shape  = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+
+        permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
+        permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
+        permute(permuted_output_shape, PermutationVector(1U, 2U, 0U));
+
+        const TensorInfo permuted_input   = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW);
+        const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
+        const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output,
+                                                                                       conv_info, depth_multiplier, act_info, gpu_target,
+                                                                                       dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+    }
+    else if(is_nhwc)
+    {
+        if(needs_weights_reshape)
+        {
+            auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases,
+                                                                                           output, conv_info, depth_multiplier, act_info,
+                                                                                           dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info,
+                                                                                           dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target,
+                                                                                       dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+    }
+    return Status{};
+}
+} // namespace
+
 CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _kernel(nullptr), _border_handler(), _permute_input_to_nchw(), _permute_weights_to_nchw(), _permute_output_to_nhwc(), _reshape_weights(), _permuted_input(),
-      _permuted_weights(), _permuted_output(), _original_weights(nullptr), _needs_permute(false), _needs_weights_reshape(false), _is_prepared(false)
+    : _func(std::move(memory_manager))
 {
 }
 
 void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                                                ActivationLayerInfo act_info, const Size2D &dilation)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    // idx_w and idx_h only used for validation
-    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
-    ARM_COMPUTE_UNUSED(idx_w);
-    ARM_COMPUTE_UNUSED(idx_h);
+    _func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+}
 
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
+{
+    return validate_arguments_3x3(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation);
+}
 
-    const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
+void CLDepthwiseConvolutionLayer3x3::run()
+{
+    _func.run();
+}
 
-    _needs_permute         = is_nhwc && (depth_multiplier > 1);
-    _needs_weights_reshape = is_nhwc && (depth_multiplier == 1)
-                             && is_data_type_quantized_asymmetric(input->info()->data_type());
+void CLDepthwiseConvolutionLayer3x3::prepare()
+{
+    _func.prepare();
+}
+
+CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConvolutionLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)),
+      _dwc_native_kernel(),
+      _permute_input_to_nhwc(),
+      _permute_weights_to_nhwc(),
+      _permute_output_to_nchw(),
+      _permuted_input(),
+      _permuted_weights(),
+      _permuted_output(),
+      _output_multipliers(),
+      _output_shifts(),
+      _original_weights(),
+      _input(),
+      _output(),
+      _needs_permute(false),
+      _is_prepared(false),
+      _is_quantized(false)
+{
+}
+
+void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                                                                                unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(),
+                                                                     weights->info(),
+                                                                     biases != nullptr ? biases->info() : nullptr,
+                                                                     output->info(),
+                                                                     conv_info,
+                                                                     depth_multiplier,
+                                                                     act_info,
+                                                                     dilation));
+
+    _is_quantized     = is_data_type_quantized(input->info()->data_type());
     _is_prepared      = false;
     _original_weights = weights;
+    _input            = input;
+    _output           = output;
+    _needs_permute    = input->info()->data_layout() == DataLayout::NCHW;
+
+    ICLTensor       *input_to_use   = input;
+    const ICLTensor *weights_to_use = weights;
+    ICLTensor       *output_to_use  = output;
+    if(_needs_permute)
+    {
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
+
+        // Configure the function to transform the input tensor from NCHW -> NHWC
+        _permute_input_to_nhwc.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+        _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+
+        // Configure the function to transform the weights tensor from IHW -> HWI
+        _permute_weights_to_nhwc.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+
+        // Set output quantization info before dwc kernel configure
+        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
+
+        input_to_use   = &_permuted_input;
+        weights_to_use = &_permuted_weights;
+        output_to_use  = &_permuted_output;
+    }
+
+    CLTensor *output_multipliers_to_use = nullptr;
+    CLTensor *output_shifts_to_use      = nullptr;
+    if(_is_quantized)
+    {
+        const size_t idx_c       = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
+        const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
+
+        _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+        _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+
+        output_multipliers_to_use = &_output_multipliers;
+        output_shifts_to_use      = &_output_shifts;
+    }
+
+    DWCWeightsKernelInfo dwc_weights_info;
+    dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
+    DWCKernelInfo dwc_info;
+    dwc_info.activation_info = act_info;
+    _dwc_native_kernel.configure(input_to_use, weights_to_use, biases, output_to_use,
+                                 dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
+                                 output_multipliers_to_use, output_shifts_to_use);
+
+    if(_needs_permute)
+    {
+        _permuted_input.allocator()->allocate();
+
+        // Configure the function to transform the convoluted output to NCHW format
+        _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+        _permute_output_to_nchw.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+        _permuted_output.allocator()->allocate();
+    }
+
+    if(_is_quantized)
+    {
+        _output_multipliers.allocator()->allocate();
+        _output_shifts.allocator()->allocate();
+    }
+}
+
+Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+                                                                                 const PadStrideInfo &conv_info,
+                                                                                 unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
+    DWCWeightsKernelInfo dwc_weights_info;
+    dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
+    DWCKernelInfo dwc_info;
+    dwc_info.activation_info = act_info;
+
+    const bool needs_permute = input->data_layout() == DataLayout::NCHW;
+
+    const bool is_quantized = is_data_type_quantized(input->data_type());
+
+    TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32));
+    if(is_quantized)
+    {
+        if(is_data_type_quantized_per_channel(weights->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
+
+            const size_t idx_c = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
+            output_multipliers_shifts_info.set_tensor_shape(TensorShape(weights->dimension(idx_c)));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+        }
+    }
+
+    if(needs_permute)
+    {
+        TensorShape permuted_input_shape   = input->tensor_shape();
+        TensorShape permuted_weights_shape = weights->tensor_shape();
+        TensorShape permuted_output_shape  = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+
+        permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
+        permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
+        permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
+
+        const TensorInfo permuted_input   = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC);
+        const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC);
+        const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC);
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, dwc_weights_info,
+                                                                                      dwc_info, conv_info, depth_multiplier, dilation,
+                                                                                      &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier,
+                                                                                      dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+    }
+    return Status{};
+}
+
+void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::run()
+{
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    if(_needs_permute)
+    {
+        _permute_input_to_nhwc.run();
+    }
+    CLScheduler::get().enqueue(_dwc_native_kernel);
+    if(_needs_permute)
+    {
+        _permute_output_to_nchw.run();
+    }
+}
+
+void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_is_quantized)
+        {
+            _output_multipliers.map();
+            _output_shifts.map();
+            const unsigned int idx_ofms = get_data_layout_dimension_index(_output->info()->data_layout(), DataLayoutDimension::CHANNEL);
+            quantization::compute_quantized_multipliers_and_shifts(_input->info(),
+                                                                   _original_weights->info(),
+                                                                   _output->info(),
+                                                                   idx_ofms,
+                                                                   reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
+                                                                   reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
+            _output_multipliers.unmap();
+            _output_shifts.unmap();
+        }
+
+        if(_needs_permute)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+            _permuted_weights.allocator()->allocate();
+            _permute_weights_to_nhwc.run();
+            _original_weights->mark_as_unused();
+        }
+        _is_prepared = true;
+    }
+}
+
+CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)),
+      _kernel(nullptr),
+      _border_handler(),
+      _permute_input_to_nchw(),
+      _permute_weights_to_nchw(),
+      _permute_output_to_nhwc(),
+      _reshape_weights(),
+      _permuted_input(),
+      _permuted_weights(),
+      _permuted_output(),
+      _output_multipliers(),
+      _output_shifts(),
+      _original_weights(nullptr),
+      _input(nullptr),
+      _output(nullptr),
+      _needs_permute(false),
+      _needs_weights_reshape(false),
+      _is_prepared(false),
+      _is_quantized(false)
+{
+}
+
+void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+                                                                                    const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+{
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer3x3::validate(input->info(),
+                                                                        weights->info(),
+                                                                        biases != nullptr ? biases->info() : nullptr,
+                                                                        output->info(),
+                                                                        conv_info,
+                                                                        depth_multiplier,
+                                                                        act_info,
+                                                                        gpu_target,
+                                                                        dilation));
+
+    const bool is_nhwc     = input->info()->data_layout() == DataLayout::NHWC;
+    _is_quantized          = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _needs_permute         = is_nhwc && (depth_multiplier > 1);
+    _needs_weights_reshape = is_nhwc && (depth_multiplier == 1) && _is_quantized;
+
+    _is_prepared      = false;
+    _original_weights = weights;
+    _input            = input;
+    _output           = output;
 
     ICLTensor       *input_to_use   = input;
     const ICLTensor *weights_to_use = weights;
     ICLTensor       *output_to_use  = output;
 
-    const bool is_stride_1            = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
-    const bool is_dot8_supported      = dot8_supported(CLKernelLibrary::get().get_device());
-    const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+    const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
+    const bool is_stride_1              = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool is_dot8_supported        = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel;
+    const bool is_stride_1_dilation_1   = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
 
     DepthwiseConvolutionReshapeInfo info;
     info.c0        = 4;
@@ -112,9 +457,30 @@
         _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
     }
 
+    CLTensor *output_multipliers_to_use = nullptr;
+    CLTensor *output_shifts_to_use      = nullptr;
+    if(_is_quantized)
+    {
+        const size_t idx_c       = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
+        const size_t num_filters = (is_quantized_per_channel) ? weights->info()->dimension(idx_c) : 1;
+
+        _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+        _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+
+        output_multipliers_to_use = &_output_multipliers;
+        output_shifts_to_use      = &_output_shifts;
+    }
+
     // Configure kernel
-    _kernel->set_target(CLScheduler::get().target());
-    _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info, dilation);
+    _kernel->set_target(gpu_target);
+    _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier,
+                       act_info, dilation, output_multipliers_to_use, output_shifts_to_use);
+
+    if(_is_quantized)
+    {
+        _output_multipliers.allocator()->allocate();
+        _output_shifts.allocator()->allocate();
+    }
 
     // Permute output if needed
     if(_needs_permute)
@@ -136,73 +502,13 @@
     _border_handler.configure(input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
 }
 
-Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
+Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+                                                                                     const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
-
-    const bool                      is_quantized           = is_data_type_quantized_asymmetric(input->data_type());
-    const bool                      is_nhwc                = input->data_layout() == DataLayout::NHWC;
-    const bool                      needs_permute          = is_nhwc && (depth_multiplier > 1);
-    const bool                      needs_weights_reshape  = is_nhwc && (depth_multiplier == 1) && is_quantized;
-    const bool                      is_stride_1            = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
-    const bool                      is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
-    const bool                      is_dot8_supported      = dot8_supported(CLKernelLibrary::get().get_device());
-    DepthwiseConvolutionReshapeInfo info;
-    info.c0        = 4;
-    info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
-
-    if(is_quantized)
-    {
-        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = (output->total_size() == 0) ? iq_info : output->quantization_info().uniform();
-
-        const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
-        ARM_COMPUTE_UNUSED(multiplier);
-        ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
-    }
-
-    if(needs_permute)
-    {
-        TensorShape permuted_input_shape   = input->tensor_shape();
-        TensorShape permuted_weights_shape = weights->tensor_shape();
-        TensorShape permuted_output_shape  = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
-
-        permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
-        permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
-        permute(permuted_output_shape, PermutationVector(1U, 2U, 0U));
-
-        const TensorInfo permuted_input   = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW);
-        const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
-        const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
-
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target,
-                                                                                       dilation));
-    }
-    else if(is_nhwc)
-    {
-        if(needs_weights_reshape)
-        {
-            auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
-            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
-                                                                                           act_info, dilation));
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
-        }
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation));
-    }
-
-    return Status{};
+    return validate_arguments_3x3(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation);
 }
 
-void CLDepthwiseConvolutionLayer3x3::run()
+void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::run()
 {
     prepare();
 
@@ -221,10 +527,25 @@
     }
 }
 
-void CLDepthwiseConvolutionLayer3x3::prepare()
+void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepare()
 {
     if(!_is_prepared)
     {
+        if(_is_quantized)
+        {
+            _output_multipliers.map();
+            _output_shifts.map();
+            const unsigned int idx_ofms = get_data_layout_dimension_index(_output->info()->data_layout(), DataLayoutDimension::CHANNEL);
+            quantization::compute_quantized_multipliers_and_shifts(_input->info(),
+                                                                   _original_weights->info(),
+                                                                   _output->info(),
+                                                                   idx_ofms,
+                                                                   reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
+                                                                   reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
+            _output_multipliers.unmap();
+            _output_shifts.unmap();
+        }
+
         if(_needs_permute)
         {
             ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
@@ -246,259 +567,92 @@
     }
 }
 
-CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
-    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _activationlayer_function(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(),
-      _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr),
-      _optimised_function(nullptr)
+CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_manager(std::move(memory_manager)), _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_3x3(), _func_generic()
 {
 }
 
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+                                            ActivationLayerInfo act_info, const Size2D &dilation)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-
-    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
-
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
-
-    const bool can_run_optimised_3x3_kernel = (weights->info()->dimension(idx_w) == 3) && (weights->info()->dimension(idx_h) == 3);
-
-    if(bool(can_run_optimised_3x3_kernel))
+    const GPUTarget gpu_target = CLScheduler::get().target();
+    _depth_conv_func           = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info,
+                                                                   dilation, gpu_target);
+    switch(_depth_conv_func)
     {
-        auto f = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3>();
-        f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-        _optimised_function = std::move(f);
-    }
-    else
-    {
-        const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
-
-        const size_t weights_w = weights->info()->dimension(idx_w);
-        const size_t weights_h = weights->info()->dimension(idx_h);
-        const size_t weights_z = weights->info()->dimension(idx_c);
-
-        _is_prepared      = false;
-        _original_weights = weights;
-        _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
-
-        bool            append_bias = (biases != nullptr) && !_is_quantized;
-        const GPUTarget gpu_target  = CLScheduler::get().target();
-
-        // Calculate output shape
-        TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
-
-        // Output auto inizialitation if not yet initialized
-        auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
-        // Output width and height
-        const unsigned int conv_w = output_shape[idx_w];
-        const unsigned int conv_h = output_shape[idx_h];
-
-        // Set up intermediate tensors
-        const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
-        const size_t conv_size  = conv_w * conv_h;
-
-        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
-        // Im2Col configuration
-        TensorShape shape_im2col = input->info()->tensor_shape();
-        shape_im2col.set(0, patch_size);
-        shape_im2col.set(1, conv_size);
-        shape_im2col.set(2, weights_z);
-        _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-        _im2col_kernel.set_target(gpu_target);
-        _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);
-        CLScheduler::get().tune_kernel_static(_im2col_kernel);
-
-        // Weights reshape configuration
-        const TensorShape shape_weights_reshape(patch_size, weights_z);
-        _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
-        _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
-
-        // GEMV configuration
-        DataType    v2mm_dt        = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
-        TensorShape shape_v2mm_out = input->info()->tensor_shape();
-        shape_v2mm_out.set(0, conv_size * weights_z);
-        shape_v2mm_out.set(1, 1);
-        shape_v2mm_out.set(2, 1);
-        _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
-        _v2mm_kernel.set_target(gpu_target);
-        _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
-        CLScheduler::get().tune_kernel_static(_v2mm_kernel);
-        _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
-        _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
-
-        // Output staged configuration
-        if(_is_quantized)
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            _func_3x3.set_memory_group(_memory_manager);
+            _func_3x3.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
         {
-            const UniformQuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info;
-
-            int         output_multiplier = 0;
-            int         output_shift      = 0;
-            const float multiplier        = iq_info.scale * wq_info.scale / output_quant_info.scale;
-            quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-            _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
-            _output_reshaped.allocator()->allocate();
+            _func_generic.set_memory_group(_memory_manager);
+            _func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
         }
-
-        // Fill borders on inputs
-        PixelValue zero_in(static_cast<int32_t>(0));
-        PixelValue zero_w(static_cast<int32_t>(0));
-        if(_is_quantized)
-        {
-            zero_in = PixelValue(static_cast<int32_t>(iq_info.offset));
-            zero_w  = PixelValue(static_cast<int32_t>(wq_info.offset));
-        }
-        BorderSize border_size = _v2mm_kernel.border_size();
-        _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
-
-        border_size.bottom = 0;
-        _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
-
-        // Allocate intermediate tensors
-        _input_reshaped.allocator()->allocate();
-        _v2mm_output.allocator()->allocate();
-
-        //Configure Activation Layer
-        _is_activationlayer_enabled = act_info.enabled();
-
-        if(_is_activationlayer_enabled)
-        {
-            _activationlayer_function.configure(output, nullptr, act_info);
-        }
+        break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
     }
 }
 
 Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+                                             unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
 {
-    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
-
-    const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
-
-    if(!can_run_optimised_3x3_kernel)
+    const GPUTarget              gpu_target      = CLScheduler::get().target();
+    DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, gpu_target);
+    switch(depth_conv_func)
     {
-        const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            return CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation);
+        case DepthwiseConvolutionFunction::GENERIC:
+            return CLDepthwiseConvolutionLayerGeneric::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+        default:
+            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+    }
+}
 
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-        ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
-
-        const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
-        const bool         append_bias  = (biases != nullptr) && !is_quantized;
-        const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
-        const size_t       weights_w    = weights->dimension(idx_w);
-        const size_t       weights_h    = weights->dimension(idx_h);
-        const size_t       weights_z    = weights->dimension(idx_c);
-        const unsigned int conv_w       = output_shape[idx_w];
-        const unsigned int conv_h       = output_shape[idx_h];
-        const size_t       patch_size   = weights_w * weights_h + ((append_bias) ? 1 : 0);
-        const size_t       conv_size    = conv_w * conv_h;
-
-        TensorShape shape_im2col = input->tensor_shape();
-        shape_im2col.set(0, patch_size);
-        shape_im2col.set(1, conv_size);
-        shape_im2col.set(2, weights_z);
-        TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));
-
-        const TensorShape shape_weights_reshape(patch_size, weights_z);
-        TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
-
-        DataType    v2mm_dt        = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
-        TensorShape shape_v2mm_out = input->tensor_shape();
-        shape_v2mm_out.set(0, conv_size * weights_z);
-        shape_v2mm_out.set(1, 1);
-        shape_v2mm_out.set(2, 1);
-        TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
-
-        TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
-
-        if(is_quantized)
-        {
-            const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
-            const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
-            const UniformQuantizationInfo oq_info = (output->total_size() == 0) ? iq_info : output->quantization_info().uniform();
-
-            const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
-            ARM_COMPUTE_UNUSED(multiplier);
-            ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
-            ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
-        }
-
-        // Validate Activation Layer
-        if(act_info.enabled())
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
-        }
+DepthwiseConvolutionFunction CLDepthwiseConvolutionLayer::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+                                                                                            const PadStrideInfo &conv_info,
+                                                                                            unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation, GPUTarget gpu_target)
+{
+    if(bool(CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation)) && (is_data_type_float(input->data_type())
+            || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD))
+    {
+        return DepthwiseConvolutionFunction::OPTIMIZED;
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation));
+        return DepthwiseConvolutionFunction::GENERIC;
     }
-    return Status{};
 }
 
 void CLDepthwiseConvolutionLayer::run()
 {
-    prepare();
-
-    if(_optimised_function != nullptr)
+    switch(_depth_conv_func)
     {
-        _optimised_function->run();
-    }
-    else
-    {
-        CLScheduler::get().enqueue(_im2col_kernel);
-        CLScheduler::get().enqueue(_v2mm_input_fill_border);
-        CLScheduler::get().enqueue(_v2mm_kernel);
-        CLScheduler::get().enqueue(_vector_to_tensor_kernel);
-        if(_is_quantized)
-        {
-            CLScheduler::get().enqueue(_output_stage_kernel);
-        }
-        if(_is_activationlayer_enabled)
-        {
-            _activationlayer_function.run();
-        }
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            _func_3x3.run();
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
+            _func_generic.run();
+            break;
+        default:
+            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
     }
 }
 
 void CLDepthwiseConvolutionLayer::prepare()
 {
-    if(_optimised_function != nullptr)
+    switch(_depth_conv_func)
     {
-        _optimised_function->prepare();
-    }
-    else
-    {
-        if(!_is_prepared)
-        {
-            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-            // Run weights reshaping and mark original weights tensor as unused
-            _weights_reshaped.allocator()->allocate();
-            CLScheduler::get().enqueue(_weights_reshape_kernel);
-            CLScheduler::get().enqueue(_v2mm_weights_fill_border);
-            _original_weights->mark_as_unused();
-
-            CLScheduler::get().queue().finish();
-            _is_prepared = true;
-        }
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            _func_3x3.prepare();
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
+            _func_generic.prepare();
+            break;
+        default:
+            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
     }
 }
 } // namespace arm_compute
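
Note on the change above: CLDepthwiseConvolutionLayer is now a dispatcher. get_depthwiseconvolution_function() selects either the OPTIMIZED internal 3x3 path or the GENERIC native-kernel path, and configure(), run() and prepare() forward to whichever was chosen. A minimal F32 NCHW usage sketch, not part of the diff; shapes, padding and the null memory manager are illustrative assumptions.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

    using namespace arm_compute;

    void run_depthwise_example()
    {
        CLScheduler::get().default_init();

        // NCHW F32 tensors: 32x32 spatial, 16 channels, 3x3 depthwise weights.
        CLTensor src, weights, biases, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32));
        biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));

        CLDepthwiseConvolutionLayer dwc(nullptr); // memory manager is optional
        // Stride 1, pad 1: a 3x3 kernel should let the dispatcher pick the
        // OPTIMIZED path; other kernel sizes fall back to GENERIC.
        dwc.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1), /*depth_multiplier=*/1);

        for(auto *t : { &src, &weights, &biases, &dst })
        {
            t->allocator()->allocate();
        }
        dwc.run();
    }
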
diff --git a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
deleted file mode 100644
index fa2c3af..0000000
--- a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLDepthwiseSeparableConvolutionLayer::CLDepthwiseSeparableConvolutionLayer()
-    : _depthwise_conv(), _pointwise_conv()
-{
-}
-
-void CLDepthwiseSeparableConvolutionLayer::configure(ICLTensor *input, const ICLTensor *depthwise_weights, const ICLTensor *depthwise_biases, ICLTensor *depthwise_out,
-                                                     const ICLTensor *pointwise_weights, const ICLTensor *pointwise_biases, ICLTensor *output,
-                                                     const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
-{
-    _depthwise_conv.configure(input, depthwise_weights, depthwise_biases, depthwise_out, depthwise_conv_info);
-    _pointwise_conv.configure(depthwise_out, pointwise_weights, pointwise_biases, output, pointwise_conv_info);
-}
-
-void CLDepthwiseSeparableConvolutionLayer::run()
-{
-    prepare();
-
-    _depthwise_conv.run();
-    _pointwise_conv.run();
-}
-
-void CLDepthwiseSeparableConvolutionLayer::prepare()
-{
-    _depthwise_conv.prepare();
-    _pointwise_conv.prepare();
-}
\ No newline at end of file
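
With CLDepthwiseSeparableConvolutionLayer removed, a depthwise-separable block is built by chaining the two remaining functions directly, as the deleted wrapper did internally. A hedged sketch under assumed shapes, with CLDirectConvolutionLayer standing in for the 1x1 pointwise stage; the choice of function and all shape/pad values here are illustrative, not taken from the diff:

#include <initializer_list>
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Illustrative NCHW shapes: 32x32x16 input, 3x3 depthwise, 1x1 pointwise to 32 channels.
    CLTensor input, dw_weights, dw_bias, dw_out, pw_weights, pw_bias, output;
    input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    dw_weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32));
    dw_bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dw_out.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    pw_weights.allocator()->init(TensorInfo(TensorShape(1U, 1U, 16U, 32U), 1, DataType::F32));
    pw_bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(32U, 32U, 32U), 1, DataType::F32));

    CLDepthwiseConvolutionLayer depthwise;
    CLDirectConvolutionLayer    pointwise;
    depthwise.configure(&input, &dw_weights, &dw_bias, &dw_out, PadStrideInfo(1, 1, 1, 1)); // 'same' padding
    pointwise.configure(&dw_out, &pw_weights, &pw_bias, &output, PadStrideInfo(1, 1, 0, 0));

    // Allocate after configure so the functions can request any padding they need.
    for(CLTensor *t : { &input, &dw_weights, &dw_bias, &dw_out, &pw_weights, &pw_bias, &output })
    {
        t->allocator()->allocate();
    }

    depthwise.run();
    pointwise.run();
    CLScheduler::get().sync();
}
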
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index c1a39ef..b8089d8 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -63,13 +63,8 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
-    ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
 
-    const unsigned int stride_x = info.stride().first;
-    const unsigned int stride_y = info.stride().second;
-
-    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
-                                                    info.pad().first, info.pad().second, stride_x, stride_y);
+    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), info);
 
     const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
 
@@ -92,9 +87,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid.");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid.");
 
-    unsigned int        padx            = 0;
-    unsigned int        pady            = 0;
-    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, padx, pady);
+    unsigned int        deconv_pad_x = 0;
+    unsigned int        deconv_pad_y = 0;
+    const unsigned int  stride_x = info.stride().first;
+    const unsigned int  stride_y = info.stride().second;
+    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
     TensorInfo          scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
 
@@ -109,6 +106,10 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
+    const unsigned int pad_left   = info.pad_left();
+    const unsigned int pad_right  = info.pad_right();
+    const unsigned int pad_top    = info.pad_top();
+    const unsigned int pad_bottom = info.pad_bottom();
     const unsigned int stride_x = info.stride().first;
     const unsigned int stride_y = info.stride().second;
 
@@ -122,8 +123,7 @@
     _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
     _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
 
-    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
-                                                    info.pad().first, info.pad().second, stride_x, stride_y);
+    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
 
     const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
 
@@ -138,16 +138,30 @@
     _memory_group.manage(&_scaled_output);
 
     // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
-    unsigned int      padx            = 0;
-    unsigned int      pady            = 0;
-    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, padx, pady);
+    unsigned int      deconv_pad_x    = 0;
+    unsigned int      deconv_pad_y    = 0;
+    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
+
+    unsigned int deconv_pad_left  = pad_right > pad_left ? pad_right - pad_left : 0;
+    unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
+    deconv_pad_x -= deconv_pad_left + deconv_pad_right;
+    ARM_COMPUTE_ERROR_ON((deconv_pad_x % 2) != 0);
+    deconv_pad_left  += deconv_pad_x / 2;
+    deconv_pad_right += deconv_pad_x / 2;
+
+    unsigned int deconv_pad_top    = pad_bottom > pad_top ? pad_bottom - pad_top : 0;
+    unsigned int deconv_pad_bottom = pad_top > pad_bottom ? pad_top - pad_bottom : 0;
+    deconv_pad_y -= deconv_pad_top + deconv_pad_bottom;
+    ARM_COMPUTE_ERROR_ON((deconv_pad_y % 2) != 0);
+    deconv_pad_top    += deconv_pad_y / 2;
+    deconv_pad_bottom += deconv_pad_y / 2;
 
     TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
     scale_out_info.set_data_layout(data_layout);
     _scaled_output.allocator()->init(scale_out_info);
 
     // configure scale function
-    const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
+    const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
     _scale_f.configure(input, &_scaled_output, upsample_info);
 
     // Setup the function to convolve the upscaled output
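
The upsample configuration now honours asymmetric user padding: the left/right (and top/bottom) imbalance requested by the caller is assigned to the less-padded side first, and the remaining even amount is split in half. A standalone sketch of that arithmetic for the x axis, with an assumed total pad of 3 (as compute_deconvolution_upsampled_shape might return) and user pads left=1, right=0:

#include <cassert>
#include <cstdio>

int main()
{
    // Assumed inputs for illustration: total extra padding required on the upsampled
    // tensor along x, plus the user-requested asymmetric pads.
    unsigned int       deconv_pad_x = 3;
    const unsigned int pad_left = 1, pad_right = 0;

    // Give the imbalance to the side the user padded less.
    unsigned int deconv_pad_left  = pad_right > pad_left ? pad_right - pad_left : 0;
    unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
    deconv_pad_x -= deconv_pad_left + deconv_pad_right;
    assert((deconv_pad_x % 2) == 0); // the remainder must split evenly

    deconv_pad_left  += deconv_pad_x / 2;
    deconv_pad_right += deconv_pad_x / 2;

    std::printf("upsample pad left=%u right=%u\n", deconv_pad_left, deconv_pad_right); // prints left=1 right=2
}
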
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index c5da649..a8167ce 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
@@ -32,13 +33,64 @@
 
 #include <algorithm>
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::utils::cast;
 
 namespace
 {
-Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output,
+                                       GEMMLowpOutputStageInfo &gemmlowp_output_stage)
 {
+    gemmlowp_output_stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    gemmlowp_output_stage.gemmlowp_offset     = 0;
+    gemmlowp_output_stage.gemmlowp_multiplier = 0;
+    gemmlowp_output_stage.gemmlowp_shift      = 0;
+
+    // Configure output stage for quantized case
+    if(is_data_type_quantized_asymmetric(input.data_type()))
+    {
+        const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = output.quantization_info().uniform();
+
+        const auto output_quant_info = (output.total_size() == 0) ? iq_info : oq_info;
+
+        const float multiplier        = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
+        int         output_multiplier = 0;
+        int         output_shift      = 0;
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+
+        // Set the GEMMLowp output stage info
+        gemmlowp_output_stage.gemmlowp_offset     = output_quant_info.offset;
+        gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
+        gemmlowp_output_stage.gemmlowp_shift      = output_shift;
+        gemmlowp_output_stage.gemmlowp_min_bound  = 0;
+        gemmlowp_output_stage.gemmlowp_max_bound  = 255;
+        gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier);
+        gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift);
+    }
+
+    return Status{};
+}
+
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info)
+{
+    GEMMLowpOutputStageInfo gemmlowp_output_stage;
+    ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage));
+
+    const GEMMInfo &gemm_info = GEMMInfo(false,                           // is_a_reshaped
+                                         false,                           // is_b_reshaped
+                                         true,                            // reshape_b_only_on_first_run
+                                         0,                               // depth_output_gemm3d
+                                         false,                           // reinterpret_input_as_3d
+                                         fc_info.retain_internal_weights, // retain_internal_weights
+                                         gemmlowp_output_stage,           // gemmlowp_output_stage
+                                         fc_info.fp_mixed_precision,      // fp_mixed_precision
+                                         true,                            // broadcast_bias
+                                         ActivationLayerInfo());          // activation_info
+
     if(is_data_type_quantized_asymmetric(input.data_type()))
     {
         const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
@@ -52,12 +104,13 @@
         // Validate gemmlowp function
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
                                                                            &weights.clone()->set_quantization_info(weights_quantization_info),
-                                                                           nullptr,
-                                                                           &output));
+                                                                           bias,
+                                                                           &output,
+                                                                           gemm_info));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info));
     }
 
     return Status{};
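
construct_gemmlowp_output_stage reduces the requantization factor (iq.scale * wq.scale) / oq.scale to an integer multiplier plus a right shift via quantization::calculate_quantized_multiplier_less_than_one. A standalone sketch of the usual gemmlowp-style decomposition that helper performs; the library's edge-case handling may differ:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Decompose a real multiplier in (0, 1) into a Q0.31 fixed-point multiplier and a right shift,
// so that multiplier ~= quantized_multiplier * 2^-(31 + right_shift).
static void decompose_multiplier(double multiplier, std::int32_t &quantized_multiplier, int &right_shift)
{
    int          exponent    = 0;
    const double significand = std::frexp(multiplier, &exponent); // significand in [0.5, 1)
    std::int64_t q           = std::llround(significand * (1ll << 31));
    if(q == (1ll << 31)) // rounding can push the significand up to 1.0
    {
        q /= 2;
        ++exponent;
    }
    quantized_multiplier = static_cast<std::int32_t>(q);
    right_shift          = -exponent;
}

int main()
{
    // Illustrative uniform quantization scales.
    const double input_scale = 0.5, weights_scale = 0.004, output_scale = 0.25;
    const double multiplier  = (input_scale * weights_scale) / output_scale; // 0.008

    std::int32_t quantized_multiplier = 0;
    int          right_shift          = 0;
    decompose_multiplier(multiplier, quantized_multiplier, right_shift);

    const double reconstructed = quantized_multiplier * std::pow(2.0, -(31 + right_shift));
    std::printf("q=%d shift=%d reconstructed=%.9f\n", quantized_multiplier, right_shift, reconstructed);
}
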
@@ -76,14 +129,28 @@
     return CLTransposeKernel::validate(input, output);
 }
 
-CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _convert_weights(), _flatten_layer(), _reshape_weights_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
-      _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true),
-      _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
+CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+    : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), _reshape_weights_function(),
+      _mm_gemm(memory_manager, weights_manager), _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true),
+      _are_weights_reshaped(true), _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
 {
 }
-void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights)
+void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info)
 {
+    GEMMLowpOutputStageInfo gemmlowp_output_stage;
+    construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), gemmlowp_output_stage);
+
+    const GEMMInfo &gemm_info = GEMMInfo(false,                           // is_a_reshaped
+                                         false,                           // is_b_reshaped
+                                         true,                            // reshape_b_only_on_first_run
+                                         0,                               // depth_output_gemm3d
+                                         false,                           // reinterpret_input_as_3d
+                                         fc_info.retain_internal_weights, // retain_internal_weights
+                                         gemmlowp_output_stage,           // gemmlowp_output_stage
+                                         fc_info.fp_mixed_precision,      // fp_mixed_precision
+                                         true,                            // broadcast_bias
+                                         ActivationLayerInfo());          // activation_info
+
     if(_is_quantized)
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -95,7 +162,7 @@
         weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
 
         // Configure gemmlowp function
-        _mm_gemmlowp.configure(input, weights, nullptr, output);
+        _mm_gemmlowp.configure(input, weights, bias, output, gemm_info);
 
         // Revert back QuantizationInfo as input and weights could be used in other fully connected layers
         input->info()->set_quantization_info(input_quantization_info);
@@ -104,11 +171,11 @@
     else
     {
         // Configure matrix multiply kernel
-        _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */, 0, false, retain_internal_weights));
+        _mm_gemm.configure(input, weights, bias, output, 1.f, 1.f, gemm_info);
     }
 }
 
-void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights)
+void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info)
 {
     ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
 
@@ -123,18 +190,18 @@
     _flatten_layer.configure(input, &_flatten_output);
 
     // Configure matrix multiply kernel
-    configure_mm(&_flatten_output, weights, output, retain_internal_weights);
+    configure_mm(&_flatten_output, weights, bias, output, fc_info);
 
     // Allocate the output tensor for flatten once all the configure methods have been called
     _flatten_output.allocator()->allocate();
 }
 
-void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights)
+void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info)
 {
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
 
     // Configure matrix multiply kernel
-    configure_mm(input, weights, output, retain_internal_weights);
+    configure_mm(input, weights, bias, output, fc_info);
 }
 
 void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
@@ -152,27 +219,13 @@
     _are_weights_converted = true;
     _are_weights_reshaped  = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
     _is_fc_after_conv      = true;
-    _accumulate_biases     = false;
     _is_quantized          = is_data_type_quantized_asymmetric(input->info()->data_type());
     _is_prepared           = fc_info.retain_internal_weights;
     _original_weights      = weights;
 
-    // Configure gemmlowp output
-    if(_is_quantized)
+    if(_weights_manager)
     {
-        _gemmlowp_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-    }
-
-    // Configure accumulate biases kernel for non quantized asymmetric types
-    if(biases != nullptr && !_is_quantized)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-
-        _accumulate_biases = true;
-
-        // Configure accumulate biases kernel
-        _accumulate_biases_kernel.set_target(CLScheduler::get().target());
-        _accumulate_biases_kernel.configure(output, biases);
+        _weights_manager->manage(weights);
     }
 
     const ICLTensor *weights_to_use = weights;
@@ -199,50 +252,51 @@
     // Reshape weights if needed
     if(!_are_weights_reshaped)
     {
-        // Reshape the weights
-        _reshape_weights_kernel.configure(weights, &_reshape_weights_output);
-        weights_to_use = &_reshape_weights_output;
+        if(_weights_manager && _weights_manager->are_weights_managed(weights))
+        {
+            _reshape_weights_managed_function.configure(weights);
+            weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed_function));
+        }
+        else
+        {
+            // Reshape the weights
+            _reshape_weights_function.configure(weights, &_reshape_weights_output);
+            weights_to_use = &_reshape_weights_output;
+        }
     }
 
     // Convert weights if needed
     if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
     {
-        // Convert weights
-        _convert_weights.configure(weights_to_use,
-                                   &_converted_weights_output,
-                                   input->info()->tensor_shape(),
-                                   fc_info.weights_trained_layout);
+        if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use))
+        {
+            _convert_weights_managed.configure(weights_to_use,
+                                               input->info()->tensor_shape(),
+                                               fc_info.weights_trained_layout);
+            weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_convert_weights_managed));
+        }
+        else
+        {
+            // Convert weights
+            _convert_weights.configure(weights_to_use,
+                                       &_converted_weights_output,
+                                       input->info()->tensor_shape(),
+                                       fc_info.weights_trained_layout);
 
-        weights_to_use         = &_converted_weights_output;
+            weights_to_use = &_converted_weights_output;
+        }
         _are_weights_converted = false;
     }
 
-    // Configure fc core
-    ICLTensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
     if(_is_fc_after_conv)
     {
         // Fully Connected layer after a Convolution Layer without batches
-        configure_conv_fc(input, weights_to_use, tmp_output, fc_info.retain_internal_weights);
+        configure_conv_fc(input, weights_to_use, biases, output, fc_info);
     }
     else
     {
         // Fully Connected layer after a Fully Connected Layer without batches
-        configure_fc_fc(input, weights_to_use, tmp_output, fc_info.retain_internal_weights);
-    }
-
-    // Configure output stage for asymmetric quantized types
-    if(_is_quantized)
-    {
-        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
-        float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
-        int   output_multiplier;
-        int   output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, oq_info.offset);
-        _gemmlowp_output.allocator()->allocate();
+        configure_fc_fc(input, weights_to_use, biases, output, fc_info);
     }
 }
 
@@ -254,22 +308,12 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
 
-    bool            weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
-    bool            is_fc_after_conv = true;
-    bool            is_quantized     = is_data_type_quantized_asymmetric(input->data_type());
-    const GPUTarget gpu_target       = CLScheduler::get().target();
+    bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+    bool is_fc_after_conv = true;
 
     const ITensorInfo &flatten_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)).set_data_layout(DataLayout::NCHW));
     const ITensorInfo &reshaped_weights  = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
     const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
-    const ITensorInfo &gemmlowp_output   = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-
-    // Configure accumulate biases kernel for non quantized asymmetric types
-    if(biases != nullptr && !is_quantized)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target));
-    }
 
     // With the Fully Connected layer we can have 4 different cases:
     //  1) Convolution layer -> Fully Connected layer without batches
@@ -279,7 +323,6 @@
 
     const ITensorInfo *input_to_use   = input;
     const ITensorInfo *weights_to_use = weights;
-    const ITensorInfo *tmp_output     = (is_quantized) ? &gemmlowp_output : output;
 
     // Check if we have a fully connected layer with batches
     const bool is_batched_fc_layer = output->dimension(1) > 1;
@@ -325,21 +368,9 @@
         // Fully Connected layer after a Fully Connected Layer without batches
         ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
     }
+
     // Validate matrix multiply kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
-
-    // Validate output stage for asymmetric quantized types
-    if(is_quantized)
-    {
-        const UniformQuantizationInfo iq_info    = input->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info    = weights->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info    = output->quantization_info().uniform();
-        const float                   multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
-
-        ARM_COMPUTE_UNUSED(multiplier);
-        ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&gemmlowp_output, biases, output));
-    }
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info));
 
     return Status{};
 }
@@ -365,26 +396,16 @@
     {
         _mm_gemm.run();
     }
-
-    // Accumulate biases if provided
-    if(_is_quantized)
-    {
-        _gemmlowp_output_stage.run();
-    }
-    else
-    {
-        if(_accumulate_biases)
-        {
-            CLScheduler::get().enqueue(_accumulate_biases_kernel);
-        }
-    }
 }
 
 void CLFullyConnectedLayer::prepare()
 {
     if(!_is_prepared)
     {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        if(!_weights_manager)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        }
 
         auto release_unused = [](CLTensor * w)
         {
@@ -401,22 +422,36 @@
         // Reshape of the weights if needed (happens only once)
         if(!_are_weights_reshaped)
         {
-            // Run reshape weights kernel and mark weights as unused
-            _reshape_weights_output.allocator()->allocate();
-            _reshape_weights_kernel.run();
+            if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+            {
+                cur_weights = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->run(cur_weights, &_reshape_weights_managed_function));
+            }
+            else
+            {
+                // Run reshape weights kernel and mark weights as unused
+                _reshape_weights_output.allocator()->allocate();
+                _reshape_weights_function.run();
 
-            cur_weights->mark_as_unused();
-            cur_weights           = &_reshape_weights_output;
+                cur_weights->mark_as_unused();
+                cur_weights = &_reshape_weights_output;
+            }
             _are_weights_reshaped = true;
         }
 
         // Convert weights if needed (happens only once)
         if(!_are_weights_converted)
         {
-            _converted_weights_output.allocator()->allocate();
-            _convert_weights.run();
+            if(_weights_manager && _weights_manager->are_weights_managed(cur_weights))
+            {
+                _weights_manager->run(cur_weights, &_convert_weights_managed);
+            }
+            else
+            {
+                _converted_weights_output.allocator()->allocate();
+                _convert_weights.run();
+                cur_weights->mark_as_unused();
+            }
 
-            cur_weights->mark_as_unused();
             _are_weights_converted = true;
         }
 
@@ -436,3 +471,4 @@
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
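
CLFullyConnectedLayer now takes an optional IWeightsManager (as do CLGEMM and CLGEMMConvolutionLayer below), so reshaped or converted weights can be produced once and shared by every function that consumes the same weights tensor. A hedged usage sketch; the shapes and the two-layers-sharing-one-weights-tensor scenario are illustrative assumptions:

#include <initializer_list>
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/IWeightsManager.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    IWeightsManager weights_manager; // tracks which functions consume which weights

    CLTensor in0, in1, weights, bias, out0, out1;
    in0.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
    in1.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
    out0.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
    out1.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));

    // Both layers reference the same weights tensor; the manager lets the weight
    // transpose run once and its result be reused.
    CLFullyConnectedLayer fc0(nullptr, &weights_manager);
    CLFullyConnectedLayer fc1(nullptr, &weights_manager);
    fc0.configure(&in0, &weights, &bias, &out0);
    fc1.configure(&in1, &weights, &bias, &out1);

    for(CLTensor *t : { &in0, &in1, &weights, &bias, &out0, &out1 })
    {
        t->allocator()->allocate();
    }

    fc0.run(); // prepare() runs lazily inside run() on first use
    fc1.run();
    CLScheduler::get().sync();
}
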
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index e78395f..8d46014 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -36,6 +36,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/float_ops.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
@@ -44,12 +45,15 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 using namespace arm_compute::cl_gemm;
+using namespace arm_compute::utils::cast;
 
-CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
     : _memory_group(std::move(memory_manager)),
+      _weights_manager(weights_manager),
       _mm_kernel(),
       _reshape_lhs_kernel(),
       _reshape_rhs_kernel(),
+      _reshape_rhs_kernel_managed(),
       _mm_reshaped_kernel(),
       _mm_reshaped_only_rhs_kernel(),
       _tmp_a(),
@@ -65,37 +69,53 @@
 {
     GEMMType gemm_type = GEMMType::RESHAPED_V1;
 
-    if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
+    if(gpu_target_is_in(gpu_target, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+                        GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72,
+                        GPUTarget::G76, GPUTarget::G77))
     {
-        if((m > 1) && (n < 16))
+        if(data_type == DataType::F32)
         {
-            gemm_type = GEMMType::RESHAPED_V1;
-        }
-        else if((m == 1) && (data_type == DataType::F32))
-        {
-            gemm_type = GEMMType::RESHAPED_ONLY_RHS;
-        }
-        else
-        {
-            // COMPMID-852
-            if((k > 256) && (m > 4) && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+            if((m > 1) && (n < 16))
             {
-                constexpr float alpha = 3.2f;
-                constexpr float fact0 = 1.51f;
-                constexpr float fact1 = 1.66f;
-                constexpr float ops   = 12.0f;
-                const float     scale = k > 1024 ? 1.07f : 1.0f;
-                gemm_type             = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
+                gemm_type = GEMMType::RESHAPED_V1;
+            }
+            else if(m == 1)
+            {
+                gemm_type = GEMMType::RESHAPED_ONLY_RHS;
             }
             else
             {
-                gemm_type = GEMMType::NATIVE;
+                // COMPMID-852
+                if((k > 256) && (m > 4) && reshape_b_only_on_first_run)
+                {
+                    constexpr float alpha = 3.2f;
+                    constexpr float fact0 = 1.51f;
+                    constexpr float fact1 = 1.66f;
+                    constexpr float ops   = 12.0f;
+                    const float     scale = k > 1024 ? 1.07f : 1.0f;
+                    gemm_type             = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
+                }
+                else
+                {
+                    gemm_type = GEMMType::NATIVE;
+                }
+            }
+
+            const auto workload = static_cast<float>((m * n) / 20.0f);
+
+            gemm_type = ((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && (data_type == DataType::F32)) ? GEMMType::RESHAPED_V2 : gemm_type;
+        }
+        else
+        {
+            if((m == 1) || (!reshape_b_only_on_first_run))
+            {
+                gemm_type = GEMMType::RESHAPED_ONLY_RHS;
+            }
+            else
+            {
+                gemm_type = GEMMType::RESHAPED_V2;
             }
         }
-
-        const auto workload = static_cast<float>((m * n) / 20.0f);
-
-        gemm_type = ((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && (data_type == DataType::F32)) ? GEMMType::RESHAPED_V2 : gemm_type;
     }
     else
     {
@@ -162,8 +182,12 @@
 
     GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
 
+    const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b));
+
+    // Manage intermediate buffers
     _memory_group.manage(&_tmp_a);
-    if(!_reshape_b_only_on_first_run)
+
+    if(!_reshape_b_only_on_first_run && use_mm_b)
     {
         _memory_group.manage(&_tmp_b);
     }
@@ -172,16 +196,26 @@
     _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
 
     // Configure transpose kernel
-    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+    ICLTensor *reshaped_rhs = &_tmp_b;
+    if(_weights_manager && _weights_manager->are_weights_managed(b))
+    {
+        _reshape_rhs_kernel_managed.configure(b, rhs_info);
+        reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
+    }
+    else
+    {
+        _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+    }
 
     // Configure and tune matrix multiply kernel
-    _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
+    _mm_kernel.configure(&_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
 
     CLScheduler::get().tune_kernel_static(_mm_kernel);
 
     // Allocate intermediate tensors
     _tmp_a.allocator()->allocate();
-    if(!_reshape_b_only_on_first_run)
+
+    if(!_reshape_b_only_on_first_run && use_mm_b)
     {
         _tmp_b.allocator()->allocate();
     }
@@ -212,12 +246,16 @@
     _reshape_lhs_kernel.set_target(gpu_target);
     _mm_kernel.set_target(gpu_target);
 
+    const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b));
+
     // Manage intermediate buffers
     _memory_group.manage(&_tmp_a);
-    if(!_reshape_b_only_on_first_run)
+
+    if(!_reshape_b_only_on_first_run && use_mm_b)
     {
         _memory_group.manage(&_tmp_b);
     }
+
     // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
 
     GEMMLHSMatrixInfo lhs_info{};
@@ -231,14 +269,25 @@
     std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
 
     _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
-    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+    ICLTensor *reshaped_rhs = &_tmp_b;
+    if(_weights_manager && _weights_manager->are_weights_managed(b))
+    {
+        _reshape_rhs_kernel_managed.configure(b, rhs_info);
+        reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
+    }
+    else
+    {
+        _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+    }
 
     // Configure and tune matrix multiply kernel
-    _mm_reshaped_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+    _mm_reshaped_kernel.configure(&_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
 
     // Allocate intermediate tensors
     _tmp_a.allocator()->allocate();
-    if(!_reshape_b_only_on_first_run)
+
+    if(!_reshape_b_only_on_first_run && use_mm_b)
     {
         _tmp_b.allocator()->allocate();
     }
@@ -268,8 +317,10 @@
     // Set the target for the kernels
     _mm_kernel.set_target(gpu_target);
 
+    const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b));
+
     // Manage intermediate buffers
-    if(!_reshape_b_only_on_first_run)
+    if(!_reshape_b_only_on_first_run && use_mm_b)
     {
         _memory_group.manage(&_tmp_b);
     }
@@ -284,12 +335,21 @@
     // Configure lhs_info and rhs_info
     std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
 
-    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+    ICLTensor *reshaped_rhs = &_tmp_b;
+    if(_weights_manager && _weights_manager->are_weights_managed(b))
+    {
+        _reshape_rhs_kernel_managed.configure(b, rhs_info);
+        reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
+    }
+    else
+    {
+        _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+    }
 
     // Configure and tune matrix multiply kernel
-    _mm_reshaped_only_rhs_kernel.configure(a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+    _mm_reshaped_only_rhs_kernel.configure(a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
 
-    if(!_reshape_b_only_on_first_run)
+    if(!_reshape_b_only_on_first_run && use_mm_b)
     {
         _tmp_b.allocator()->allocate();
     }
@@ -591,7 +651,14 @@
             if(!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
-                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
+                {
+                    _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+                }
+                else
+                {
+                    CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                }
             }
 
             CLScheduler::get().enqueue(_mm_kernel, true);
@@ -605,7 +672,14 @@
             if(!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
-                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
+                {
+                    _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+                }
+                else
+                {
+                    CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                }
             }
 
             CLScheduler::get().enqueue(_mm_reshaped_kernel, true);
@@ -616,7 +690,14 @@
             if(!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
-                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
+                {
+                    _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+                }
+                else
+                {
+                    CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                }
             }
 
             CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, true);
@@ -635,10 +716,17 @@
     {
         if(_gemm_type != GEMMType::NATIVE && _reshape_b_only_on_first_run)
         {
-            // Run transpose kernel and mark original weights tensor as unused
-            _tmp_b.allocator()->allocate();
-            CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
-            _original_b->mark_as_unused();
+            if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
+            {
+                _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+            }
+            else
+            {
+                // Run transpose kernel and mark original weights tensor as unused
+                _tmp_b.allocator()->allocate();
+                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                _original_b->mark_as_unused();
+            }
         }
         CLScheduler::get().queue().finish();
         _is_prepared = true;
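
select_gemm_type() now branches on data type for the listed Mali targets: F32 keeps the COMPMID-852 heuristic plus a workload-based promotion to RESHAPED_V2, while non-F32 types choose RESHAPED_ONLY_RHS or RESHAPED_V2 outright. A standalone restatement of that decision logic (same inputs, kernel selection only, no configuration):

#include <cstdio>

enum class GEMMType { NATIVE, RESHAPED_V1, RESHAPED_V2, RESHAPED_ONLY_RHS };

// Mirrors the F32 / non-F32 split introduced above for the supported Mali targets.
static GEMMType select_gemm_type(unsigned int m, unsigned int n, unsigned int k,
                                 bool is_f32, bool reshape_b_only_on_first_run)
{
    GEMMType gemm_type = GEMMType::RESHAPED_V1;
    if(is_f32)
    {
        if((m > 1) && (n < 16))
        {
            gemm_type = GEMMType::RESHAPED_V1;
        }
        else if(m == 1)
        {
            gemm_type = GEMMType::RESHAPED_ONLY_RHS;
        }
        else if((k > 256) && (m > 4) && reshape_b_only_on_first_run)
        {
            // COMPMID-852 cost model: weigh reshaping overhead against the native kernel.
            constexpr float alpha = 3.2f, fact0 = 1.51f, fact1 = 1.66f, ops = 12.0f;
            const float     scale = k > 1024 ? 1.07f : 1.0f;
            gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
        }
        else
        {
            gemm_type = GEMMType::NATIVE;
        }

        // Large workloads promote RESHAPED_V1 to RESHAPED_V2.
        const float workload = (m * n) / 20.0f;
        if((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1))
        {
            gemm_type = GEMMType::RESHAPED_V2;
        }
    }
    else
    {
        gemm_type = ((m == 1) || !reshape_b_only_on_first_run) ? GEMMType::RESHAPED_ONLY_RHS : GEMMType::RESHAPED_V2;
    }
    return gemm_type;
}

int main()
{
    // m=4096, n=8: picked as RESHAPED_V1, then promoted by the workload check; prints 2 (RESHAPED_V2).
    std::printf("%d\n", static_cast<int>(select_gemm_type(4096, 8, 512, true, true)));
}
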
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index be6be04..d322723 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
@@ -35,8 +36,10 @@
 #include <memory>
 #include <tuple>
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::utils::cast;
 
 CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
     : _weights_reshape_kernel()
@@ -63,13 +66,14 @@
 Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
 
     if(biases != nullptr)
     {
         const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
-        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type()));
+        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(weights->data_type()));
+
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
         ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
@@ -78,7 +82,6 @@
     if((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
-
         CLWeightsReshapeKernel::validate(weights, biases, output, num_groups);
     }
 
@@ -90,9 +93,10 @@
     CLScheduler::get().enqueue(_weights_reshape_kernel);
 }
 
-CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(),
-      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false)
+CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+    : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager, weights_manager),
+      _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false),
+      _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false)
 {
 }
 
@@ -197,9 +201,9 @@
 
     const unsigned int kernel_width  = weights->info()->dimension(idx_width);
     const unsigned int kernel_height = weights->info()->dimension(idx_height);
+    const unsigned int num_kernels   = weights->info()->dimension(idx_kernels);
 
     const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-    const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
     const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
 
     _is_prepared      = weights_info.retain_internal_weights();
@@ -233,11 +237,12 @@
                                                  conv_info,
                                                  dilation);
 
-    unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels) / num_groups;
+    unsigned int mat_weights_cols = num_kernels / num_groups;
 
     const ICLTensor *biases_to_use = biases;
     bool             append_bias   = false;
 
+    ICLTensor *weights_to_use = &_weights_reshaped;
     if(num_groups != 1 && biases != nullptr)
     {
         // num_groups != 1 can only be for NCHW
@@ -245,11 +250,27 @@
         biases_to_use = nullptr;
         append_bias   = true;
 
-        _reshape_weights.configure(weights, biases, &_weights_reshaped, num_groups);
+        if(_weights_manager && _weights_manager->are_weights_managed(weights))
+        {
+            _reshape_weights_managed.configure(weights, biases, num_groups);
+            weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed));
+        }
+        else
+        {
+            _reshape_weights.configure(weights, biases, &_weights_reshaped, num_groups);
+        }
     }
     else
     {
-        _reshape_weights.configure(weights, nullptr, &_weights_reshaped, num_groups);
+        if(_weights_manager && _weights_manager->are_weights_managed(weights))
+        {
+            _reshape_weights_managed.configure(weights, nullptr, num_groups);
+            weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed));
+        }
+        else
+        {
+            _reshape_weights.configure(weights, nullptr, &_weights_reshaped, num_groups);
+        }
     }
 
     // Create tensor to store im2col reshaped inputs
@@ -289,20 +310,28 @@
     }
 
     GEMMLowpOutputStageInfo gemmlowp_output_stage;
-    gemmlowp_output_stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-    gemmlowp_output_stage.gemmlowp_offset     = 0;
-    gemmlowp_output_stage.gemmlowp_multiplier = 0;
-    gemmlowp_output_stage.gemmlowp_shift      = 0;
+    gemmlowp_output_stage.type            = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    gemmlowp_output_stage.gemmlowp_offset = 0;
 
     // Configure output stage for quantized case
     if(_is_quantized)
     {
-        const auto output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info;
+        const auto         output_quant_info        = (output->info()->total_size() == 0) ? iq_info : oq_info;
+        const bool         is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
+        const unsigned int num_filters              = (is_quantized_per_channel) ? num_kernels : 1;
 
-        const float multiplier        = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
-        int         output_multiplier = 0;
-        int         output_shift      = 0;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
+
+        gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
+        gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
+        quantization::compute_quantized_multipliers_and_shifts(input->info(),
+                                                               weights->info(),
+                                                               output->info(),
+                                                               idx_kernels,
+                                                               gemmlowp_output_stage.gemmlowp_multipliers.data(),
+                                                               gemmlowp_output_stage.gemmlowp_shifts.data());
+        gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
+        gemmlowp_output_stage.gemmlowp_shift      = gemmlowp_output_stage.gemmlowp_shifts[0];
 
         int min_activation = 0;
         int max_activation = 0;
@@ -329,18 +358,16 @@
         }
 
         // Set the GEMMLowp output stage info
-        gemmlowp_output_stage.gemmlowp_offset     = output_quant_info.offset;
-        gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
-        gemmlowp_output_stage.gemmlowp_shift      = output_shift;
-        gemmlowp_output_stage.gemmlowp_min_bound  = min_activation;
-        gemmlowp_output_stage.gemmlowp_max_bound  = max_activation;
+        gemmlowp_output_stage.gemmlowp_offset    = output_quant_info.offset;
+        gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
+        gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
     }
 
     // Configure and tune GEMM
     // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
 
-    configure_mm(gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info);
+    configure_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info);
 
     if(!_skip_im2col)
     {
@@ -375,8 +402,17 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
+    const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
+
+    if(is_quantized_per_channel)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() != DataType::QASYMM8, "Input data type not compatible with Weights");
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    }
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8");
@@ -391,6 +427,7 @@
 
     const unsigned int kernel_width  = weights->dimension(idx_width);
     const unsigned int kernel_height = weights->dimension(idx_height);
+    const unsigned int num_kernels   = weights->dimension(idx_kernels);
 
     TensorInfo         im2col_reshaped_info{};
     TensorInfo         info_gemm{};
@@ -398,15 +435,10 @@
     const ITensorInfo *gemm_input_to_use  = input;
     const ITensorInfo *gemm_output_to_use = output;
     const ITensorInfo *weights_to_use     = weights;
-
-    const bool is_quantized    = is_data_type_quantized_asymmetric(data_type);
-    const bool skip_im2col     = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-    const bool skip_col2im     = data_layout == DataLayout::NHWC;
-    bool       fuse_activation = true;
-
-    const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
-    const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
-    const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+    const bool         is_quantized       = is_data_type_quantized_asymmetric(data_type);
+    const bool         skip_im2col        = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+    const bool         skip_col2im        = data_layout == DataLayout::NHWC;
+    bool               fuse_activation    = true;
 
     ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -442,7 +474,7 @@
                                                  conv_info,
                                                  dilation);
 
-    unsigned int mat_weights_cols = weights->dimension(idx_kernels) / num_groups;
+    unsigned int mat_weights_cols = num_kernels / num_groups;
 
     const ITensorInfo *biases_to_use = biases;
     bool               append_bias   = false;
@@ -493,20 +525,27 @@
     }
 
     GEMMLowpOutputStageInfo gemmlowp_output_stage;
-    gemmlowp_output_stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-    gemmlowp_output_stage.gemmlowp_offset     = 0;
-    gemmlowp_output_stage.gemmlowp_multiplier = 0;
-    gemmlowp_output_stage.gemmlowp_shift      = 0;
+    gemmlowp_output_stage.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    gemmlowp_output_stage.gemmlowp_offset          = 0;
+    gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
 
     if(is_quantized)
     {
-        const auto output_quant_info = (output->total_size() == 0) ? iq_info : oq_info;
+        const UniformQuantizationInfo iq_info           = input->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info           = output->quantization_info().uniform();
+        const auto                    output_quant_info = (output->total_size() == 0) ? iq_info : oq_info;
+        const unsigned int            num_filters       = (is_quantized_per_channel) ? num_kernels : 1;
 
-        const float multiplier        = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
-        int         output_multiplier = 0;
-        int         output_shift      = 0;
-
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+        gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
+        gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
+        quantization::compute_quantized_multipliers_and_shifts(input,
+                                                               weights,
+                                                               output,
+                                                               idx_kernels,
+                                                               gemmlowp_output_stage.gemmlowp_multipliers.data(),
+                                                               gemmlowp_output_stage.gemmlowp_shifts.data());
+        gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
+        gemmlowp_output_stage.gemmlowp_shift      = gemmlowp_output_stage.gemmlowp_shifts[0];
 
         int min_activation = 0;
         int max_activation = 0;
@@ -533,11 +572,9 @@
         }
 
         // Set the GEMMLowp output stage info
-        gemmlowp_output_stage.gemmlowp_offset     = output_quant_info.offset;
-        gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
-        gemmlowp_output_stage.gemmlowp_shift      = output_shift;
-        gemmlowp_output_stage.gemmlowp_min_bound  = min_activation;
-        gemmlowp_output_stage.gemmlowp_max_bound  = max_activation;
+        gemmlowp_output_stage.gemmlowp_offset    = output_quant_info.offset;
+        gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
+        gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
     }
 
     // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
@@ -602,11 +639,17 @@
     if(!_is_prepared)
     {
         ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        // Run weights reshaping and mark original weights tensor as unused
-        _weights_reshaped.allocator()->allocate();
-        _reshape_weights.run();
-        _original_weights->mark_as_unused();
+        if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+        {
+            _weights_manager->run(_original_weights, &_reshape_weights_managed);
+        }
+        else
+        {
+            // Run weights reshaping and mark original weights tensor as unused
+            _weights_reshaped.allocator()->allocate();
+            _reshape_weights.run();
+            _original_weights->mark_as_unused();
+        }
 
         // Prepare GEMM
         _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
@@ -619,3 +662,4 @@
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
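
With QSYMM8_PER_CHANNEL weights the output stage now carries one multiplier/shift pair per filter, filled in by quantization::compute_quantized_multipliers_and_shifts. A standalone sketch of what that computation amounts to, using made-up scales and the same Q0.31 decomposition as above; the library helper may handle edge cases differently:

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    // Illustrative uniform input/output scales and per-channel weight scales (one per filter).
    const double              input_scale = 0.5, output_scale = 0.25;
    const std::vector<double> weight_scales = { 0.004, 0.002, 0.008 };

    std::vector<std::int32_t> multipliers(weight_scales.size());
    std::vector<int>          shifts(weight_scales.size());

    for(std::size_t i = 0; i < weight_scales.size(); ++i)
    {
        const double multiplier = (input_scale * weight_scales[i]) / output_scale;

        // Q0.31 fixed-point decomposition: multiplier ~= q * 2^-(31 + shift).
        int          exponent    = 0;
        const double significand = std::frexp(multiplier, &exponent);
        std::int64_t q           = std::llround(significand * (1ll << 31));
        if(q == (1ll << 31))
        {
            q /= 2;
            ++exponent;
        }
        multipliers[i] = static_cast<std::int32_t>(q);
        shifts[i]      = -exponent;

        std::printf("filter %zu: multiplier=%d shift=%d\n", i, multipliers[i], shifts[i]);
    }
}
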
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index 36a120e..4671be5 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -151,8 +151,8 @@
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
     }
 
-    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
-                                                    0, 0, deconv_info.stride().first, deconv_info.stride().second);
+    const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
+    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
     const TensorShape deconv_shape       = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
     TensorInfo        col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
 
@@ -279,7 +279,7 @@
     {
         const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
         const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = _gemmlowp_final.info()->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
 
         float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
         int   output_multiplier(0);
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 0286cb3..4c0a521 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 namespace arm_compute
@@ -49,6 +50,7 @@
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
+      _weights_to_qasymm8(),
       _mm_midgard_kernel(),
       _mm_native_kernel(),
       _mm_reshaped_only_rhs_kernel(),
@@ -57,18 +59,24 @@
       _mtx_b_reduction_kernel(),
       _offset_contribution_kernel(),
       _offset_contribution_output_stage_kernel(),
+      _qasymm8_weights(),
       _vector_sum_col(),
       _vector_sum_row(),
       _tmp_b(),
       _mm_result_s32(),
+      _gemm_output_stage_multipliers(),
+      _gemm_output_stage_shifts(),
+      _matrix_a(nullptr),
       _original_b(nullptr),
+      _output(nullptr),
       _a_offset(0),
       _b_offset(0),
       _is_gemm_reshaped(true),
       _is_midgard(false),
       _reshape_b_only_on_first_run(false),
       _is_prepared(false),
-      _fuse_output_stage(false)
+      _fuse_output_stage(false),
+      _convert_to_qasymm8(false)
 {
 }
 
@@ -81,7 +89,12 @@
     _original_b                  = b;
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
     _a_offset                    = a->info()->quantization_info().uniform().offset;
-    _b_offset                    = b->info()->quantization_info().uniform().offset;
+    _matrix_a                    = a;
+    _output                      = output;
+
+    _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type())
+                          && is_data_type_quantized_asymmetric(a->info()->data_type());
+    _b_offset = _convert_to_qasymm8 ? -128 : b->info()->quantization_info().uniform().offset;
 
     // Get the GPU target
     const GPUTarget gpu_target = CLScheduler::get().target();
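
When the per-channel symmetric weights are converted to QASYMM8, the function pins _b_offset to -128 and lets the usual offset-contribution step absorb it. A hedged reference of the integer decomposition the row/column reduction kernels rely on, written as plain C++ rather than the CL kernels:

#include <cstdint>
#include <vector>

// Reference for one accumulator of a quantized GEMM: sum_k (a_k - a_off) * (b_k - b_off),
// expanded so that the raw uint8 dot product, the row/column sums and the constant term
// can be computed separately. The offset-contribution kernel adds the last three terms back;
// b_off would be -128 in the converted-weights case above.
int32_t quantized_dot(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b,
                      int32_t a_off, int32_t b_off)
{
    int32_t    acc = 0, sum_a = 0, sum_b = 0;
    const auto k = static_cast<int32_t>(a.size());
    for(int32_t i = 0; i < k; ++i)
    {
        acc += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
        sum_a += a[i];
        sum_b += b[i];
    }
    // Equivalent to accumulating (a[i] - a_off) * (b[i] - b_off) directly:
    return acc - b_off * sum_a - a_off * sum_b + k * a_off * b_off;
}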
@@ -91,8 +104,6 @@
     _mm_native_kernel.set_target(gpu_target);
     _mm_reshaped_only_rhs_kernel.set_target(gpu_target);
 
-    const ICLTensor *matrix_a = a;
-    const ICLTensor *matrix_b = b;
     GEMMRHSMatrixInfo rhs_info;
     GEMMLHSMatrixInfo lhs_info;
 
@@ -110,6 +121,16 @@
     _is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
     _is_midgard       = gpu_target == GPUTarget::MIDGARD;
 
+    if(_convert_to_qasymm8)
+    {
+        // Set data type for converted weights
+        TensorInfo weights_info(*b->info());
+        weights_info.set_data_type(DataType::QASYMM8);
+        _qasymm8_weights.allocator()->init(weights_info);
+        _weights_to_qasymm8.configure(b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
+    }
+
+    const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
     if(_is_gemm_reshaped)
     {
         matrix_b = &_tmp_b;
@@ -123,7 +144,7 @@
         std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
         // Configure reshape RHS kernel
-        _mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info);
+        _mtx_b_reshape_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
     }
 
     // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
@@ -137,7 +158,7 @@
         }
 
         // Configure Matrix B reduction kernel
-        _mtx_b_reduction_kernel.configure(b, &_vector_sum_col);
+        _mtx_b_reduction_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col);
     }
 
     // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -161,14 +182,14 @@
         if(_is_gemm_reshaped)
         {
             // Configure and tune matrix multiply kernel
-            _mm_reshaped_only_rhs_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
         }
         else
         {
             if(_is_midgard)
             {
                 // Configure matrix multiply kernel
-                _mm_midgard_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+                _mm_midgard_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
             }
             else
             {
@@ -176,13 +197,27 @@
                 std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
                 // Configure matrix multiply kernel
-                _mm_native_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+                _mm_native_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
             }
         }
-
         // Configure offset contribution kernel
+        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+
+        _gemm_output_stage_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+        _gemm_output_stage_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+
         _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
-                                                           _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());
+                                                           _a_offset, _b_offset, gemm_info.gemmlowp_output_stage(), &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+
+        _gemm_output_stage_multipliers.allocator()->allocate();
+        _gemm_output_stage_shifts.allocator()->allocate();
+        // Compute GEMM output multipliers and shifts for output stage
+        _gemm_output_stage_multipliers.map();
+        _gemm_output_stage_shifts.map();
+        std::memcpy(_gemm_output_stage_multipliers.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
+        std::memcpy(_gemm_output_stage_shifts.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
+        _gemm_output_stage_multipliers.unmap();
+        _gemm_output_stage_shifts.unmap();
 
         _mm_result_s32.allocator()->allocate();
     }
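
The multipliers and shifts are small host-side vectors, so they are staged into CL tensors once at configure time through a blocking map. A minimal sketch of that pattern for a standalone CLTensor (the helper name is illustrative):

#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include <cstdint>
#include <cstring>
#include <vector>

// Copy a host-side vector of int32 values into a 1D S32 CLTensor.
void upload_s32(arm_compute::CLTensor &tensor, const std::vector<int32_t> &values)
{
    using namespace arm_compute;
    tensor.allocator()->init(TensorInfo(TensorShape(values.size()), 1, DataType::S32));
    tensor.allocator()->allocate();

    tensor.map(); // Blocking map: the buffer becomes host-visible
    std::memcpy(tensor.ptr_to_element(Coordinates(0)), values.data(), values.size() * sizeof(int32_t));
    tensor.unmap(); // Hand the buffer back to the device before any kernel uses it
}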
@@ -191,14 +226,14 @@
         if(_is_gemm_reshaped)
         {
             // Configure and tune matrix multiply kernel
-            _mm_reshaped_only_rhs_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
         }
         else
         {
             if(_is_midgard)
             {
                 // Configure matrix multiply kernel
-                _mm_midgard_kernel.configure(matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+                _mm_midgard_kernel.configure(_matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
             }
             else
             {
@@ -206,7 +241,7 @@
                 std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
                 // Configure matrix multiply kernel
-                _mm_native_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+                _mm_native_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
             }
         }
 
@@ -237,7 +272,15 @@
 Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    if(b->data_type() == DataType::QSYMM8_PER_CHANNEL)
+    {
+        // DataType::QSYMM8_PER_CHANNEL is supported only for weights
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() != DataType::QASYMM8, "Matrix A is not quantized while Matrix B is");
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    }
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
 
@@ -245,7 +288,6 @@
     int32_t b_offset = b->quantization_info().uniform().offset;
 
     const ITensorInfo *matrix_a_info = a;
-    const ITensorInfo *matrix_b_info = b;
 
     TensorInfo        tmp_b_info{};
     GEMMRHSMatrixInfo rhs_info;
@@ -266,6 +308,16 @@
 
     const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
 
+    bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
+                              && is_data_type_quantized_asymmetric(a->data_type());
+    TensorInfo weights_info(*b);
+    if(convert_to_qasymm8)
+    {
+        b_offset = -128;
+        weights_info.set_data_type(DataType::QASYMM8);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConvertLayerKernel::validate(b, &weights_info, ConvertPolicy::WRAP, 0));
+    }
+    const ITensorInfo *matrix_b_info = &weights_info;
     if(reshape_matrix_b)
     {
         matrix_b_info = &tmp_b_info;
@@ -274,8 +326,8 @@
         std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
         // Validate reshape RHS kernel
-        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+        auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
     }
 
     TensorInfo info_vector_sum_col{};
@@ -284,10 +336,10 @@
     // Validate matrix B reduction kernel only if _a_offset is not equal to 0
     if(a_offset != 0)
     {
-        info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+        info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);
 
         // Configure Matrix B reduction kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col));
     }
 
     // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -332,13 +384,19 @@
         }
 
         // Validate offset contribution kernel
+        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+
+        const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                             b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                             c,
                                                                                             output,
                                                                                             a_offset, b_offset,
-                                                                                            gemm_info.gemmlowp_output_stage()));
+                                                                                            gemm_info.gemmlowp_output_stage(),
+                                                                                            &gemm_output_stage_multipliers_shifts_info,
+                                                                                            &gemm_output_stage_multipliers_shifts_info));
     }
     else
     {
@@ -438,6 +496,12 @@
 {
     if(!_is_prepared)
     {
+        if(_convert_to_qasymm8)
+        {
+            _qasymm8_weights.allocator()->allocate();
+            CLScheduler::get().enqueue(_weights_to_qasymm8, false);
+        }
+
         if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
         {
             ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index d712a23..c9eb8ab 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -30,26 +30,33 @@
 namespace arm_compute
 {
 CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
+    : _memory_group(memory_manager),
       _permute_deltas_kernel(),
       _flatten_deltas_kernel(),
       _permute_scores_kernel(),
       _flatten_scores_kernel(),
       _compute_anchors_kernel(),
       _bounding_box_kernel(),
-      _memset_kernel(),
-      _padded_copy_kernel(),
-      _cpp_nms_kernel(),
+      _pad_kernel(),
+      _dequantize_anchors(),
+      _dequantize_deltas(),
+      _quantize_all_proposals(),
+      _cpp_nms(memory_manager),
       _is_nhwc(false),
+      _is_qasymm8(false),
       _deltas_permuted(),
       _deltas_flattened(),
+      _deltas_flattened_f32(),
       _scores_permuted(),
       _scores_flattened(),
       _all_anchors(),
+      _all_anchors_f32(),
       _all_proposals(),
+      _all_proposals_quantized(),
       _keeps_nms_unused(),
       _classes_nms_unused(),
       _proposals_4_roi_values(),
+      _all_proposals_to_use(nullptr),
       _num_valid_proposals(nullptr),
       _scores_out(nullptr)
 {
@@ -61,63 +68,93 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
     ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
 
-    _is_nhwc                         = scores->info()->data_layout() == DataLayout::NHWC;
-    const DataType data_type         = deltas->info()->data_type();
-    const int      num_anchors       = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
-    const int      feat_width        = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
-    const int      feat_height       = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
-    const int      total_num_anchors = num_anchors * feat_width * feat_height;
-    const int      pre_nms_topN      = info.pre_nms_topN();
-    const int      post_nms_topN     = info.post_nms_topN();
-    const size_t   values_per_roi    = info.values_per_roi();
+    _is_nhwc                        = scores->info()->data_layout() == DataLayout::NHWC;
+    const DataType scores_data_type = scores->info()->data_type();
+    _is_qasymm8                     = scores_data_type == DataType::QASYMM8;
+    const int    num_anchors        = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+    const int    feat_width         = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+    const int    feat_height        = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
+    const int    total_num_anchors  = num_anchors * feat_width * feat_height;
+    const int    pre_nms_topN       = info.pre_nms_topN();
+    const int    post_nms_topN      = info.post_nms_topN();
+    const size_t values_per_roi     = info.values_per_roi();
+
+    const QuantizationInfo scores_qinfo   = scores->info()->quantization_info();
+    const DataType         rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type;
+    const QuantizationInfo rois_qinfo     = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
 
     // Compute all the anchors
     _memory_group.manage(&_all_anchors);
     _compute_anchors_kernel.configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
 
     const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
-    _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, data_type));
+    _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
 
     // Permute and reshape deltas
+    _memory_group.manage(&_deltas_flattened);
     if(!_is_nhwc)
     {
         _memory_group.manage(&_deltas_permuted);
-        _memory_group.manage(&_deltas_flattened);
         _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
         _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
         _deltas_permuted.allocator()->allocate();
     }
     else
     {
-        _memory_group.manage(&_deltas_flattened);
         _flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
     }
 
     const TensorShape flatten_shape_scores(1, total_num_anchors);
-    _scores_flattened.allocator()->init(TensorInfo(flatten_shape_scores, 1, data_type));
+    _scores_flattened.allocator()->init(TensorInfo(flatten_shape_scores, 1, scores_data_type, scores_qinfo));
 
     // Permute and reshape scores
+    _memory_group.manage(&_scores_flattened);
     if(!_is_nhwc)
     {
         _memory_group.manage(&_scores_permuted);
-        _memory_group.manage(&_scores_flattened);
         _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
         _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
         _scores_permuted.allocator()->allocate();
     }
     else
     {
-        _memory_group.manage(&_scores_flattened);
         _flatten_scores_kernel.configure(scores, &_scores_flattened);
     }
 
+    CLTensor *anchors_to_use = &_all_anchors;
+    CLTensor *deltas_to_use  = &_deltas_flattened;
+    if(_is_qasymm8)
+    {
+        _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32));
+        _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32));
+        _memory_group.manage(&_all_anchors_f32);
+        _memory_group.manage(&_deltas_flattened_f32);
+        // Dequantize anchors to float
+        _dequantize_anchors.configure(&_all_anchors, &_all_anchors_f32);
+        _all_anchors.allocator()->allocate();
+        anchors_to_use = &_all_anchors_f32;
+        // Dequantize deltas to float
+        _dequantize_deltas.configure(&_deltas_flattened, &_deltas_flattened_f32);
+        _deltas_flattened.allocator()->allocate();
+        deltas_to_use = &_deltas_flattened_f32;
+    }
     // Bounding box transform
     _memory_group.manage(&_all_proposals);
     BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f);
-    _bounding_box_kernel.configure(&_all_anchors, &_all_proposals, &_deltas_flattened, bbox_info);
-    _deltas_flattened.allocator()->allocate();
-    _all_anchors.allocator()->allocate();
+    _bounding_box_kernel.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info);
+    deltas_to_use->allocator()->allocate();
+    anchors_to_use->allocator()->allocate();
 
+    _all_proposals_to_use = &_all_proposals;
+    if(_is_qasymm8)
+    {
+        _memory_group.manage(&_all_proposals_quantized);
+        // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
+        _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
+        _quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized);
+        _all_proposals.allocator()->allocate();
+        _all_proposals_to_use = &_all_proposals_quantized;
+    }
     // The original layer implementation first selects the best pre_nms_topN anchors (thus having a lightweight sort)
     // that are then transformed by bbox_transform. The boxes generated are then fed into a non-sorting NMS operation.
     // Since we are reusing the NMS layer and we don't implement any CL/sort, we let NMS do the sorting (of all the input)
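
For QASYMM8 inputs the anchors and deltas are dequantized to F32 before the bounding-box transform, and the resulting proposals are requantized to QASYMM16 with a fixed QuantizationInfo(0.125f, 0). A hedged sketch of the scalar arithmetic behind that round trip (the CL kernels naturally operate on whole tensors):

#include <algorithm>
#include <cmath>
#include <cstdint>

// QASYMM8 -> real value
float dequantize_qasymm8(uint8_t q, float scale, int32_t offset)
{
    return scale * (static_cast<float>(q) - static_cast<float>(offset));
}

// real value -> QASYMM16; here scale = 0.125f and offset = 0, i.e. a 1/8-pixel
// resolution for the box coordinates
uint16_t quantize_qasymm16(float value, float scale = 0.125f, int32_t offset = 0)
{
    const int32_t q = static_cast<int32_t>(std::round(value / scale)) + offset;
    return static_cast<uint16_t>(std::min(std::max(q, 0), 65535));
}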
@@ -128,12 +165,12 @@
     _memory_group.manage(&_keeps_nms_unused);
 
     // Note that NMS needs outputs preinitialized.
-    auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, data_type);
-    auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, data_type);
+    auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo);
+    auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo);
     auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
 
     // Initialize temporaries (unused) outputs
-    _classes_nms_unused.allocator()->init(TensorInfo(TensorShape(1, 1), 1, data_type));
+    _classes_nms_unused.allocator()->init(TensorInfo(TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo));
     _keeps_nms_unused.allocator()->init(*scores_out->info());
 
     // Save the output (to map and unmap them at run)
@@ -141,26 +178,26 @@
     _num_valid_proposals = num_valid_proposals;
 
     _memory_group.manage(&_proposals_4_roi_values);
-    _cpp_nms_kernel.configure(&_scores_flattened, &_all_proposals, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
-                              BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()));
+    _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
+                       BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()));
     _keeps_nms_unused.allocator()->allocate();
     _classes_nms_unused.allocator()->allocate();
-    _all_proposals.allocator()->allocate();
+    _all_proposals_to_use->allocator()->allocate();
     _scores_flattened.allocator()->allocate();
 
     // Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
-    _padded_copy_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+    _pad_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
     _proposals_4_roi_values.allocator()->allocate();
-
-    _memset_kernel.configure(proposals, PixelValue());
 }
 
 Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
                                           const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW, DataLayout::NHWC);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas);
 
     const int num_anchors       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
     const int feat_width        = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
@@ -169,8 +206,17 @@
     const int total_num_anchors = num_anchors * feat_width * feat_height;
     const int values_per_roi    = info.values_per_roi();
 
+    const bool is_qasymm8 = scores->data_type() == DataType::QASYMM8;
+
     ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
 
+    if(is_qasymm8)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16);
+        const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform();
+        ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f);
+    }
+
     TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
     ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
 
@@ -190,15 +236,36 @@
     TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
     ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
 
-    TensorInfo scores_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+    TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
     TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&scores_permuted_info, &scores_flattened_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, BoundingBoxTransformInfo(info.im_width(), info.im_height(),
-                                                                       1.f)));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(&proposals_4_roi_values, proposals, PaddingList{ { 0, 1 } }));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(proposals, PixelValue()));
+    TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
+    TensorInfo  proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0));
+    if(is_qasymm8)
+    {
+        TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(&all_anchors_info, &all_anchors_f32_info));
+
+        TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
+
+        TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
+                                                                           BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayerKernel::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
+        proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized;
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
+                                                                           BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
 
     if(num_valid_proposals->total_size() > 0)
     {
@@ -212,7 +279,17 @@
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(proposals, deltas);
+        if(is_qasymm8)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16);
+            const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform();
+            ARM_COMPUTE_RETURN_ERROR_ON(proposals_qinfo.scale != 0.125f);
+            ARM_COMPUTE_RETURN_ERROR_ON(proposals_qinfo.offset != 0);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(proposals, scores);
+        }
     }
 
     if(scores_out->total_size() > 0)
@@ -229,7 +306,7 @@
 {
     // Map inputs
     _scores_flattened.map(true);
-    _all_proposals.map(true);
+    _all_proposals_to_use->map(true);
 
     // Map outputs
     _scores_out->map(CLScheduler::get().queue(), true);
@@ -239,7 +316,7 @@
     _classes_nms_unused.map(true);
 
     // Run nms
-    CPPScheduler::get().schedule(&_cpp_nms_kernel, Window::DimX);
+    _cpp_nms.run();
 
     // Unmap outputs
     _keeps_nms_unused.unmap();
@@ -250,7 +327,7 @@
 
     // Unmap inputs
     _scores_flattened.unmap();
-    _all_proposals.unmap();
+    _all_proposals_to_use->unmap();
 }
 
 void CLGenerateProposalsLayer::run()
@@ -270,12 +347,23 @@
     CLScheduler::get().enqueue(_flatten_deltas_kernel, false);
     CLScheduler::get().enqueue(_flatten_scores_kernel, false);
 
+    if(_is_qasymm8)
+    {
+        CLScheduler::get().enqueue(_dequantize_anchors, false);
+        CLScheduler::get().enqueue(_dequantize_deltas, false);
+    }
+
     // Build the boxes
     CLScheduler::get().enqueue(_bounding_box_kernel, false);
+
+    if(_is_qasymm8)
+    {
+        CLScheduler::get().enqueue(_quantize_all_proposals, false);
+    }
+
     // Non maxima suppression
     run_cpp_nms_kernel();
     // Add dummy batch indexes
-    CLScheduler::get().enqueue(_memset_kernel, true);
-    CLScheduler::get().enqueue(_padded_copy_kernel, true);
+    CLScheduler::get().enqueue(_pad_kernel, true);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
new file mode 100644
index 0000000..2b0987f
--- /dev/null
+++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+CLInstanceNormalizationLayer::CLInstanceNormalizationLayer()
+{
+}
+
+void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernel>();
+    k->configure(input, output, gamma, beta, epsilon);
+    _kernel = std::move(k);
+}
+
+Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
+{
+    return CLInstanceNormalizationLayerKernel::validate(input, output, gamma, beta, epsilon);
+}
+} // namespace arm_compute
\ No newline at end of file
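
A hedged usage sketch for the new function; the tensor shape and the gamma/beta/epsilon values below are illustrative, not taken from the library's tests:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h"

int main()
{
    using namespace arm_compute;
    CLScheduler::get().default_init(); // Create the CL context and queue

    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));

    CLInstanceNormalizationLayer norm;
    norm.configure(&input, &output, 1.f /* gamma */, 0.f /* beta */, 1e-12f /* epsilon */);

    input.allocator()->allocate();
    output.allocator()->allocate();
    // ... fill the input tensor ...

    norm.run();                // Enqueues the kernel
    CLScheduler::get().sync(); // Wait for the result
    return 0;
}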
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index e76e4f6..7d1c818 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -38,7 +38,7 @@
 {
 constexpr int max_input_tensor_dim = 3;
 } // namespace
-    
+
 CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
@@ -46,6 +46,9 @@
 
 void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis, float epsilon)
 {
+    // Reset auxiliary tensor
+    _sumsq.allocator()->init(TensorInfo());
+
     // Manage intermediate buffers
     _memory_group.manage(&_sumsq);
 
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
index 11cf85e..e5f1278 100644
--- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -159,8 +159,7 @@
     const float multiplier        = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
     int         output_multiplier = 0;
     int         output_shift      = 0;
-
-    quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+    quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
 
     _memory_group.manage(&_output_lowp);
     _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift);
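
The 4096 factor comes from the fixed QSYMM16 format of this stage: with 12 fractional bits the output scale is 2^-12, so the effective requantization multiplier is (input_scale * weights_scale) / 2^-12. A one-line sketch of that relationship (illustrative helper name):

#include <cmath>

// Effective multiplier handed to calculate_quantized_multiplier() above.
float lstm_output_stage_multiplier(float input_scale, float weights_scale)
{
    const float output_scale = std::ldexp(1.0f, -12);    // 2^-12 == 1/4096
    return (input_scale * weights_scale) / output_scale; // == 4096.f * input_scale * weights_scale
}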
@@ -361,12 +360,13 @@
     input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
     weights_transposed.set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
 
-    // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12))
     const TensorInfo output_lowp(output_highp.tensor_shape(), 1, DataType::QSYMM16, qsymm_3);
 
-    const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
-    ARM_COMPUTE_UNUSED(multiplier);
-    ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
+    const float multiplier        = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
+    int         output_multiplier = 0;
+    int         output_shift      = 0;
+    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+
     // _output_stage
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp));
 
@@ -504,7 +504,7 @@
     _tanh_output_state.run();
     _mul_output_state_tmp_output_gate.run();
 
-    // Requantize output state from QSYMM16 to QASYMM16
+    // Requantize output state from QSYMM16 to QASYMM8
     _dequantize.run();
     _quantize.run();
 }
@@ -553,4 +553,4 @@
     }
 }
 
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 99e3121..8f36a69 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,183 +23,25 @@
  */
 #include "arm_compute/runtime/CL/functions/CLPadLayer.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/ToolchainSupport.h"
-
 namespace arm_compute
 {
 CLPadLayer::CLPadLayer()
-    : _copy_kernel(), _mode(), _padding(), _memset_kernel(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
+    : _pad_kernel(), _copy_kernel(), _perform_pad(false)
 {
 }
 
-void CLPadLayer::configure_constant_mode(ICLTensor *input, ICLTensor *output, const PaddingList &padding, const PixelValue constant_value)
-{
-    // Set the pages of the output to the constant_value.
-    _memset_kernel.configure(output, constant_value);
-
-    // Fill out padding list with zeroes.
-    PaddingList padding_extended = padding;
-    for(size_t i = padding.size(); i < TensorShape::num_max_dimensions; i++)
-    {
-        padding_extended.emplace_back(PaddingInfo{ 0, 0 });
-    }
-
-    // Create a window within the output tensor where the input will be copied.
-    Window copy_window = Window();
-    for(uint32_t i = 0; i < output->info()->num_dimensions(); ++i)
-    {
-        copy_window.set(i, Window::Dimension(padding_extended[i].first, padding_extended[i].first + input->info()->dimension(i), 1));
-    }
-    // Copy the input to the output, leaving the padding filled with the constant_value.
-    _copy_kernel.configure(input, output, PaddingList(), &copy_window);
-}
-
-void CLPadLayer::configure_reflect_symmetric_mode(ICLTensor *input, ICLTensor *output)
-{
-    int64_t last_padding_dimension = _padding.size() - 1;
-    // Reflecting can be performed by effectively unfolding the input as follows:
-    // For each dimension starting at DimX:
-    //      Create a before and after slice, which values depend on the selected padding mode
-    //      Concatenate the before and after padding with the tensor to be padded
-
-    // Two strided slice functions will be required for each dimension padded as well as a
-    // concatenate function and the tensors to hold the temporary results.
-    _slice_functions.resize(2 * _num_dimensions);
-    _slice_results.resize(2 * _num_dimensions);
-    _concat_functions.resize(_num_dimensions);
-    _concat_results.resize(_num_dimensions - 1);
-
-    Coordinates starts_before{};
-    Coordinates ends_before{};
-    Coordinates starts_after{};
-    Coordinates ends_after{};
-    Coordinates strides{};
-    ICLTensor *prev = input;
-    for(uint32_t i = 0; i < _num_dimensions; ++i)
-    {
-        // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again.
-        if(i > 0)
-        {
-            strides.set(i - 1, 1);
-        }
-
-        if(_padding[i].first > 0 || _padding[i].second > 0)
-        {
-            // Set the starts, ends, and strides values for the current dimension.
-            // Due to the bit masks passed to strided slice, the values below the current dimension in
-            // starts and ends will be ignored so do not need to be modified.
-            if(_mode == PaddingMode::REFLECT)
-            {
-                starts_before.set(i, _padding[i].first);
-                ends_before.set(i, 0);
-                starts_after.set(i, input->info()->dimension(i) - 2);
-                ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 2);
-                strides.set(i, -1);
-            }
-            else
-            {
-                starts_before.set(i, _padding[i].first - 1);
-                ends_before.set(i, -1);
-                starts_after.set(i, input->info()->dimension(i) - 1);
-                ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 1);
-                strides.set(i, -1);
-            }
-
-            // Strided slice wraps negative indexes around to the end of the range,
-            // instead this should indicate use of the full range and so the bit mask will be modified.
-            const int32_t begin_mask_before = starts_before[i] < 0 ? ~0 : ~(1u << i);
-            const int32_t end_mask_before   = ends_before[i] < 0 ? ~0 : ~(1u << i);
-            const int32_t begin_mask_after  = starts_after[i] < 0 ? ~0 : ~(1u << i);
-            const int32_t end_mask_after    = ends_after[i] < 0 ? ~0 : ~(1u << i);
-
-            // Reflect the input values for the padding before and after the input.
-            std::vector<ICLTensor *> concat_vector;
-            if(_padding[i].first > 0)
-            {
-                if(i < prev->info()->num_dimensions())
-                {
-                    _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
-                    concat_vector.push_back(&_slice_results[2 * i]);
-                }
-                else
-                {
-                    // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
-                    concat_vector.push_back(prev);
-                }
-            }
-            concat_vector.push_back(prev);
-            if(_padding[i].second > 0)
-            {
-                if(i < prev->info()->num_dimensions())
-                {
-                    _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
-                    concat_vector.push_back(&_slice_results[2 * i + 1]);
-                }
-                else
-                {
-                    // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
-                    concat_vector.push_back(prev);
-                }
-            }
-            // Concatenate the padding before and after with the input.
-            ICLTensor *out = (static_cast<int32_t>(i) == last_padding_dimension) ? output : &_concat_results[i];
-            _concat_functions[i].configure(concat_vector, out, i);
-            prev = out;
-        }
-    }
-    for(uint32_t i = 0; i < _num_dimensions; ++i)
-    {
-        if((static_cast<int32_t>(i) != last_padding_dimension))
-        {
-            _concat_results[i].allocator()->allocate();
-        }
-        _slice_results[2 * i].allocator()->allocate();
-        _slice_results[2 * i + 1].allocator()->allocate();
-    }
-}
-
 void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
 
-    _padding = padding;
-    _mode    = mode;
-
-    TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
-
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
-
-    // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied.
-    int64_t last_padding_dimension = _padding.size() - 1;
-    for(; last_padding_dimension >= 0; --last_padding_dimension)
+    _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
     {
-        if(_padding[last_padding_dimension].first > 0 || _padding[last_padding_dimension].second > 0)
-        {
-            break;
-        }
-    }
-    _num_dimensions = last_padding_dimension + 1;
-    if(_num_dimensions > 0)
+        return info.first > 0 || info.second > 0;
+    });
+
+    if(_perform_pad)
     {
-        switch(_mode)
-        {
-            case PaddingMode::CONSTANT:
-            {
-                configure_constant_mode(input, output, padding, constant_value);
-                break;
-            }
-            case PaddingMode::REFLECT:
-            case PaddingMode::SYMMETRIC:
-            {
-                configure_reflect_symmetric_mode(input, output);
-                break;
-            }
-            default:
-                ARM_COMPUTE_ERROR("Padding mode not supported.");
-        }
+        _pad_kernel.configure(input, output, padding, constant_value, mode);
     }
     else
     {
@@ -207,111 +49,34 @@
         _copy_kernel.configure(input, output);
     }
 }
-
 Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions());
-
-    TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
-
-    // Use CLCopyKernel and CLMemsetKernel to validate all padding modes as this includes all of the shape and info validation.
-    PaddingList padding_extended = padding;
-    for(size_t i = padding.size(); i < TensorShape::num_max_dimensions; i++)
+    bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
     {
-        padding_extended.emplace_back(PaddingInfo{ 0, 0 });
-    }
+        return info.first > 0 || info.second > 0;
+    });
 
-    Window copy_window = Window();
-    for(uint32_t i = 0; i < padded_shape.num_dimensions(); ++i)
+    if(perform_pad)
     {
-        copy_window.set(i, Window::Dimension(padding_extended[i].first, padding_extended[i].first + input->dimension(i), 1));
-    }
-    if(output->total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, PaddingList(), &copy_window));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, constant_value));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, &input->clone()->set_tensor_shape(padded_shape), PaddingList(), &copy_window));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(&input->clone()->set_tensor_shape(padded_shape), constant_value));
-    }
-
-    switch(mode)
-    {
-        case PaddingMode::CONSTANT:
-        {
-            break;
-        }
-        case PaddingMode::REFLECT:
-        case PaddingMode::SYMMETRIC:
-        {
-            for(uint32_t i = 0; i < padding.size(); ++i)
-            {
-                if(mode == PaddingMode::REFLECT)
-                {
-                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
-                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
-                }
-                else
-                {
-                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first > input->dimension(i));
-                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second > input->dimension(i));
-                }
-            }
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Invalid mode");
-        }
+        Window copy_window = Window();
+        copy_window.use_tensor_dimensions(output->tensor_shape());
+        ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, PaddingList(), &copy_window));
     }
     return Status{};
 }
-
 void CLPadLayer::run()
 {
-    if(_num_dimensions > 0)
+    if(_perform_pad)
     {
-        switch(_mode)
-        {
-            case PaddingMode::CONSTANT:
-            {
-                CLScheduler::get().enqueue(_memset_kernel, false);
-                CLScheduler::get().enqueue(_copy_kernel, true);
-                break;
-            }
-            case PaddingMode::REFLECT:
-            case PaddingMode::SYMMETRIC:
-            {
-                for(uint32_t i = 0; i < _num_dimensions; ++i)
-                {
-                    if(_padding[i].first > 0 || _padding[i].second > 0)
-                    {
-                        if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
-                        {
-                            _slice_functions[2 * i].run();
-                        }
-                        if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
-                        {
-                            _slice_functions[2 * i + 1].run();
-                        }
-                        CLScheduler::get().sync();
-                        _concat_functions[i].run();
-                        CLScheduler::get().sync();
-                    }
-                }
-                break;
-            }
-            default:
-                ARM_COMPUTE_ERROR("Padding mode not supported.");
-        }
+        CLScheduler::get().enqueue(_pad_kernel);
     }
     else
     {
-        CLScheduler::get().enqueue(_copy_kernel, true);
+        CLScheduler::get().enqueue(_copy_kernel);
     }
 }
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 38f0a75..3aa5a81 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -26,15 +26,17 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int axis)
@@ -56,17 +58,52 @@
 } // namespace
 
 CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
+    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _reshape_kernel(), _op(), _num_of_stages(), _reduction_axis(), _is_serial(),
+      _is_reshape_required(false)
 {
 }
 
-Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
-    const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
-    bool               is_serial     = is_data_type_quantized(input->data_type()) || axis != 0;
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+
+    const unsigned int num_of_stages       = calculate_number_of_stages(input, axis);
+    const bool         is_serial           = needs_serialized_reduction(op, input->data_type(), axis);
+    const bool         is_arg_min_max      = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+    const bool         is_reshape_required = !keep_dims || is_arg_min_max;
+
+    if(is_reshape_required)
+    {
+        const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
+    }
+
+    auto *output_internal = output;
+
+    TensorInfo output_before_reshape;
+    const auto input_shape        = input->tensor_shape();
+    const auto input_data_type    = input->data_type();
+    const auto input_num_channels = input->num_channels();
+    const auto input_qinfo        = input->quantization_info();
+    const auto output_data_type   = is_arg_min_max ? DataType::S32 : output->data_type();
+
+    auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
+    {
+        ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo);
+    };
+
+    if(is_reshape_required)
+    {
+        auto shape_before_reshape = input_shape;
+        shape_before_reshape.set(axis, 1);
+        initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channels, input_qinfo);
+        output_internal = &output_before_reshape;
+    }
+
     if(is_serial)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op));
     }
     else
     {
@@ -74,14 +111,13 @@
         std::vector<TensorInfo> sums_vector(num_of_stages - 1);
 
         // Create intermediate tensor info
-        TensorShape shape{ input->tensor_shape() };
+        TensorShape shape{ input_shape };
+
+        shape.set(0, ceil(shape.x() / 128.f));
 
         for(unsigned int i = 0; i < num_of_stages - 1; i++)
         {
-            shape.set(0, ceil(shape.x() / 128.f));
-            sums_vector[i].set_data_type(input->data_type());
-            sums_vector[i].set_tensor_shape(shape);
-            sums_vector[i].set_num_channels(input->num_channels());
+            initialize_tensorinfo(sums_vector[i], shape, input_data_type, input_num_channels, input_qinfo);
         }
 
         ReductionOperation first_kernel_op;
@@ -130,17 +166,72 @@
 
         // Validate ReductionOperation on the last stage
         const unsigned int last_stage = num_of_stages - 1;
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output, axis, last_kernel_op, input->dimension(0)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output_internal, axis, last_kernel_op, input->dimension(0)));
+    }
+
+    if(is_reshape_required)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(output_internal, output));
     }
 
     return Status{};
 }
 
-void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output)
 {
-    _num_of_stages  = calculate_number_of_stages(input->info(), axis);
-    _reduction_axis = axis;
-    _is_serial      = is_data_type_quantized(input->info()->data_type()) || axis != 0;
+    if(!_is_reshape_required && _is_serial)
+    {
+        return output;
+    }
+
+    auto       intermediate_result_vector_size = _is_serial ? 1 : _num_of_stages;
+    const auto is_arg_min_max                  = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN);
+
+    if(!_is_reshape_required)
+    {
+        --intermediate_result_vector_size;
+    }
+
+    _results_vector.resize(intermediate_result_vector_size);
+    auto shape = input->info()->tensor_shape();
+
+    shape.set(_reduction_axis, _is_serial ? 1 : ceil(shape.x() / 128.f));
+
+    for(auto &v : _results_vector)
+    {
+        if(&v == &_results_vector.back() && _is_reshape_required)
+        {
+            shape.set(_reduction_axis, 1);
+        }
+        v.allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+    }
+
+    if(is_arg_min_max)
+    {
+        _results_vector.back().info()->set_data_type(DataType::S32).set_is_resizable(true).reset_padding();
+    }
+
+    return _is_reshape_required ? &_results_vector.back() : output;
+}
+
+void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+{
+    _op                       = op;
+    _num_of_stages            = calculate_number_of_stages(input->info(), axis);
+    _reduction_axis           = axis;
+    _is_serial                = needs_serialized_reduction(op, input->info()->data_type(), axis);
+    const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+    _is_reshape_required      = !keep_dims || is_arg_min_max;
+
+    auto *output_internal = configure_intermediate_result_vector(input, output);
+
+    // ArgMinMax might not provide an initialized output tensor, so initialize it here.
+    if(_is_reshape_required)
+    {
+        const TensorShape output_shape     = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+        const auto        output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
+        auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+    }
 
     // Configure reduction operation kernels
     _reduction_kernels_vector.resize(_num_of_stages);
@@ -148,20 +239,16 @@
     // Create temporary tensors
     if(_is_serial)
     {
-        _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+        if(_is_reshape_required)
+        {
+            _memory_group.manage(&_results_vector.back());
+        }
+
+        _reduction_kernels_vector[0].configure(input, output_internal, axis, op, 0);
     }
     else
     {
         _border_handlers_vector.resize(_num_of_stages);
-        _results_vector.resize(_num_of_stages - 1);
-        TensorShape shape{ input->info()->tensor_shape() };
-        for(unsigned int i = 0; i < _num_of_stages - 1; i++)
-        {
-            shape.set(0, ceil(shape.x() / 128.f));
-            _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
-        }
-
-        // Apply ReductionOperation only on first kernel
         _memory_group.manage(&_results_vector[0]);
 
         ReductionOperation first_kernel_op;
@@ -262,10 +349,22 @@
         // Apply ReductionOperation on the last stage
         const unsigned int last_stage  = _num_of_stages - 1;
         const unsigned int input_width = input->info()->dimension(0);
-        _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output, axis, last_kernel_op, input_width);
+
+        if(_is_reshape_required)
+        {
+            _memory_group.manage(&_results_vector.back());
+        }
+
+        _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width);
         _border_handlers_vector[last_stage].configure(&_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
         _results_vector[last_stage - 1].allocator()->allocate();
     }
+
+    if(_is_reshape_required)
+    {
+        _reshape_kernel.configure(&_results_vector.back(), output);
+        _results_vector.back().allocator()->allocate();
+    }
 }
 
 void CLReductionOperation::run()
@@ -284,4 +383,10 @@
             CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
         }
     }
+
+    if(_is_reshape_required)
+    {
+        CLScheduler::get().enqueue(_reshape_kernel, false);
+    }
 }
+} // namespace arm_compute
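For reference, a minimal sketch of how the extended configure(input, output, axis, op, keep_dims) interface behaves: with keep_dims = true the reduced axis is retained as a dimension of size 1 and the reduction kernels write straight into the user tensor, while with keep_dims = false (or for ARG_IDX_MIN/ARG_IDX_MAX, which also force the reshape path and an S32 result) the reduction lands in an internal result tensor that is then collapsed by the CLReshapeLayerKernel configured above. The tensor shapes, the default_init()/sync() calls and the omission of data transfer below are illustrative assumptions, not taken from this patch.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

using namespace arm_compute;

int main()
{
    // Assumption: a default CL context/queue is sufficient for this sketch.
    CLScheduler::get().default_init();

    // Hypothetical 2D input: 128 elements along axis 0, 4 along axis 1.
    CLTensor input, sum_keep, sum_drop;
    input.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::F32));

    // keep_dims = true: the reduced axis stays as a size-1 dimension -> [1, 4].
    sum_keep.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32));
    CLReductionOperation reduce_keep;
    reduce_keep.configure(&input, &sum_keep, 0, ReductionOperation::SUM, true);

    // keep_dims = false: the reduced axis is dropped -> [4]; internally the sum
    // goes to _results_vector.back() and is reshaped into this tensor.
    sum_drop.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));
    CLReductionOperation reduce_drop;
    reduce_drop.configure(&input, &sum_drop, 0, ReductionOperation::SUM, false);

    input.allocator()->allocate();
    sum_keep.allocator()->allocate();
    sum_drop.allocator()->allocate();

    // Filling `input` (e.g. via map()/unmap()) is omitted here.
    reduce_keep.run();
    reduce_drop.run();
    CLScheduler::get().sync();

    return 0;
}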
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 7e41dba..32d7f44 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -30,18 +30,19 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 namespace arm_compute
 {
-CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+template <bool IS_LOG>
+CLSoftmaxLayerGeneric<IS_LOG>::CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_kernel_ptr(), _reshape_kernel(), _max(), _sum(), _tmp(), _input_flattened(), _output_flattened(),
       _needs_flattening(false)
 {
 }
 
-void CLSoftmaxLayer::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis)
+template <bool IS_LOG>
+void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis)
 {
     // Flatten the input
     const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
@@ -70,11 +71,12 @@
     auto_init_if_empty(*output->info(), *input->info()->clone());
 }
 
-void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis)
+template <bool IS_LOG>
+void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis)
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayer::validate(input->info(), output->info(), beta, axis));
+    ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayerGeneric<IS_LOG>::validate(input->info(), output->info(), beta, axis));
 
     // Flattening is not needed only when the input is 2D and the axis is 1
     _needs_flattening = axis != 1;
@@ -115,8 +117,12 @@
     _memory_group.manage(&_max);
     _memory_group.manage(&_sum);
 
+    SoftmaxKernelInfo softmax_info;
+    softmax_info.beta   = beta;
+    softmax_info.is_log = IS_LOG;
+
     // Configure kernels
-    _max_shift_exp_sum_kernel.configure(input_2D, &_max, &_tmp, &_sum, beta);
+    _max_shift_exp_sum_kernel.configure(input_2D, &_max, &_tmp, &_sum, softmax_info);
 
     if(_needs_flattening)
     {
@@ -124,7 +130,7 @@
         _memory_group.manage(&_output_flattened);
 
         // The normalization kernel stores the result in a flat output tensor
-        _norm_kernel.configure(&_tmp, &_sum, &_output_flattened, beta);
+        _norm_kernel.configure(&_tmp, &_sum, &_output_flattened, softmax_info);
 
         // Reshape the flat output into the requested (4D) output
         _reshape_kernel.configure(&_output_flattened, output);
@@ -136,7 +142,7 @@
     else
     {
         // Softmax 2D case
-        _norm_kernel.configure(&_tmp, &_sum, output, beta);
+        _norm_kernel.configure(&_tmp, &_sum, output, softmax_info);
     }
 
     // Allocate intermediate buffers
@@ -145,7 +151,8 @@
     _sum.allocator()->allocate();
 }
 
-Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
+template <bool IS_LOG>
+Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
@@ -189,7 +196,8 @@
     return Status{};
 }
 
-void CLSoftmaxLayer::run()
+template <bool IS_LOG>
+void CLSoftmaxLayerGeneric<IS_LOG>::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
 
@@ -207,4 +215,7 @@
     }
 }
 
+template class CLSoftmaxLayerGeneric<false>;
+template class CLSoftmaxLayerGeneric<true>;
+
 } // namespace arm_compute
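On the softmax side, the function is now a class template over IS_LOG, with SoftmaxKernelInfo carrying beta and is_log into the max/shift/exp/sum and normalization kernels, and the two explicit instantiations above make both the regular and the log variant available. Below is a minimal sketch that drives the log variant through the template name directly (any public aliases such as CLLogSoftmaxLayer that the header may add are not visible in this diff); the shapes and scheduler setup are illustrative assumptions.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"

using namespace arm_compute;

int main()
{
    // Assumption: a default CL context/queue is sufficient for this sketch.
    CLScheduler::get().default_init();

    // Hypothetical batch of 8 rows with 10 classes each; the log-softmax is
    // computed over the 10 classes of each row.
    CLTensor logits, log_probs;
    logits.allocator()->init(TensorInfo(TensorShape(10U, 8U), 1, DataType::F32));
    log_probs.allocator()->init(TensorInfo(TensorShape(10U, 8U), 1, DataType::F32));

    // IS_LOG = true is forwarded to the kernels through SoftmaxKernelInfo::is_log.
    // With axis = 1 the 2D input is used as-is (_needs_flattening stays false).
    CLSoftmaxLayerGeneric<true> log_softmax;
    log_softmax.configure(&logits, &log_probs, /* beta */ 1.0f, /* axis */ 1);

    logits.allocator()->allocate();
    log_probs.allocator()->allocate();

    // Filling `logits` (e.g. via map()/unmap()) is omitted here.
    log_softmax.run();
    CLScheduler::get().sync();

    return 0;
}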