arm_compute v19.11: diff of NEON runtime functions (src/runtime/NEON/functions)
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index 6af71a3..1b86514 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,10 +24,15 @@
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 
 #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
+NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) // NOLINT
+    : INESimpleFunctionNoBorder(ctx)
+{
+}
 void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEActivationLayerKernel>();
@@ -39,3 +44,4 @@
 {
     return NEActivationLayerKernel::validate(input, output, act_info);
 }
+} // namespace arm_compute
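
Note: a minimal usage sketch of the context-aware constructor added above. The configure() call matches the signature shown in this diff; the assumption that ctx defaults to nullptr (so existing call sites keep compiling) comes from the corresponding header, which is not part of this diff.

    #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"

    using namespace arm_compute;

    // Run a ReLU activation through the new context-aware constructor.
    void relu_example(ITensor *src, ITensor *dst, IRuntimeContext *ctx)
    {
        NEActivationLayer act(ctx); // pass nullptr to keep the previous, context-free behaviour
        act.configure(src, dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
        act.run();
    }
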
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index 6863bb0..a23061e 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -29,41 +29,28 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
 
 namespace arm_compute
 {
 NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reduction_kernel(), _fill_border_kernel(), _run_fill_border(false)
+    : _reduction_function(support::cpp14::make_unique<NEReductionOperation>())
 {
+    ARM_COMPUTE_UNUSED(memory_manager);
 }
 void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op)
 {
-    _reduction_kernel.configure(input, output, axis, op);
-
-    if(axis == 0)
-    {
-        _fill_border_kernel.configure(input, _reduction_kernel.border_size(), BorderMode::REPLICATE);
-        _run_fill_border = true;
-    }
+    _reduction_function->configure(input, output, axis, op, false);
 }
 
 Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
-    ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op));
-    return Status{};
+    return NEReductionOperation::validate(input, output, axis, op, false);
 }
 
 void NEArgMinMaxLayer::run()
 {
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    if(_run_fill_border)
-    {
-        NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
-    }
-    NEScheduler::get().schedule(&_reduction_kernel, Window::DimY);
+    _reduction_function->run();
 }
 
 } // namespace arm_compute
\ No newline at end of file
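
Note: NEArgMinMaxLayer is now a thin wrapper over NEReductionOperation, and the trailing false passed to configure()/validate() is presumably the keep_dims flag, so the reduced axis is dropped from the output shape. The sketch below is illustrative only: the default constructor, the tensor shapes and the U32 index type are assumptions, while the configure() signature comes from this diff.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void argmax_example()
    {
        Tensor src;
        Tensor idx;
        src.allocator()->init(TensorInfo(TensorShape(128U, 24U), 1, DataType::F32));
        idx.allocator()->init(TensorInfo(TensorShape(24U), 1, DataType::U32)); // indices along axis 0

        NEArgMinMaxLayer argmax;
        argmax.configure(&src, /*axis=*/0, &idx, ReductionOperation::ARG_IDX_MAX);

        src.allocator()->allocate();
        idx.allocator()->allocate();
        argmax.run();
    }
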
diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
new file mode 100644
index 0000000..818e228
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info)
+{
+    // Configure Bounding Box kernel
+    auto k = arm_compute::support::cpp14::make_unique<NEBoundingBoxTransformKernel>();
+    k->configure(boxes, pred_boxes, deltas, info);
+    _kernel = std::move(k);
+}
+
+Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+{
+    return NEBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info);
+}
+} // namespace arm_compute
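
Note: NEBoundingBoxTransform (and NEComputeAllAnchors further down) follow the library's simple-function pattern, where the function owns a single kernel and validate() forwards to it. A hedged usage sketch, assuming the caller already holds a populated BoundingBoxTransformInfo (its constructor arguments are not shown in this diff):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h"

    using namespace arm_compute;

    void transform_boxes(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas,
                         const BoundingBoxTransformInfo &bb_info)
    {
        // validate() forwards to NEBoundingBoxTransformKernel::validate(), so this
        // catches shape/type problems before any kernel is configured.
        ARM_COMPUTE_ERROR_THROW_ON(NEBoundingBoxTransform::validate(boxes->info(), pred_boxes->info(), deltas->info(), bb_info));

        NEBoundingBoxTransform transform;
        transform.configure(boxes, pred_boxes, deltas, bb_info);
        transform.run();
    }
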
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 032e617..3d5fbfb 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -37,6 +37,7 @@
 #include "support/ToolchainSupport.h"
 
 #include <cstring>
+#include <inttypes.h>
 #include <utility>
 
 using namespace arm_compute;
@@ -118,7 +119,7 @@
     }
     else
     {
-        ARM_COMPUTE_ERROR("Gradient size %d not supported\n", gradient_size);
+        ARM_COMPUTE_ERROR_VAR("Gradient size %+" PRId32 " not supported\n", gradient_size);
     }
 
     // Manage intermediate buffers
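
Note: ARM_COMPUTE_ERROR_VAR is the variadic form of the error macro and takes a printf-style format string, hence the new include for the PRId32 macro. A standalone sketch of the same formatting, outside the library's macros, showing why the macro keeps the string portable across 32- and 64-bit targets:

    #include <cinttypes>
    #include <cstdio>

    // PRId32 expands to the printf conversion specifier that matches int32_t on
    // the target platform, so the sign-prefixed value is printed portably.
    void report_gradient_size(std::int32_t gradient_size)
    {
        std::printf("Gradient size %+" PRId32 " not supported\n", gradient_size);
    }
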
diff --git a/src/runtime/NEON/functions/NEComputeAllAnchors.cpp b/src/runtime/NEON/functions/NEComputeAllAnchors.cpp
new file mode 100644
index 0000000..7702fb0
--- /dev/null
+++ b/src/runtime/NEON/functions/NEComputeAllAnchors.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEComputeAllAnchors::configure(const ITensor *anchors, ITensor *all_anchors, const ComputeAnchorsInfo &info)
+{
+    // Configure ComputeAllAnchors kernel
+    auto k = arm_compute::support::cpp14::make_unique<NEComputeAllAnchorsKernel>();
+    k->configure(anchors, all_anchors, info);
+    _kernel = std::move(k);
+}
+
+Status NEComputeAllAnchors::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+{
+    return NEComputeAllAnchorsKernel::validate(anchors, all_anchors, info);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
index b5b159a..f65c035 100644
--- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights()
     : _kernel()
 {
@@ -46,3 +46,4 @@
 {
     NEScheduler::get().schedule(&_kernel, Window::DimZ);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 1f2cc3d..0411b41 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -64,13 +64,8 @@
     const unsigned int height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1);
-    ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
 
-    const unsigned int stride_x = info.stride().first;
-    const unsigned int stride_y = info.stride().second;
-
-    auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx),
-                                                    info.pad().first, info.pad().second, stride_x, stride_y);
+    auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx), info);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     if(bias != nullptr)
@@ -96,9 +91,11 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
     }
 
-    unsigned int        padx            = 0;
-    unsigned int        pady            = 0;
-    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, padx, pady);
+    unsigned int        deconv_pad_x    = 0;
+    unsigned int        deconv_pad_y    = 0;
+    const unsigned int  stride_x        = info.stride().first;
+    const unsigned int  stride_y        = info.stride().second;
+    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
     TensorInfo          scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
 
@@ -126,14 +123,17 @@
     _is_prepared      = false;
     _is_nchw          = data_layout == DataLayout::NCHW;
 
-    const unsigned int stride_x = info.stride().first;
-    const unsigned int stride_y = info.stride().second;
+    const unsigned int pad_left   = info.pad_left();
+    const unsigned int pad_right  = info.pad_right();
+    const unsigned int pad_top    = info.pad_top();
+    const unsigned int pad_bottom = info.pad_bottom();
+    const unsigned int stride_x   = info.stride().first;
+    const unsigned int stride_y   = info.stride().second;
 
     const unsigned int width_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    auto               out_dims   = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx), weights->info()->dimension(width_idx),
-                                                                    weights->info()->dimension(height_idx),
-                                                                    info.pad().first, info.pad().second, stride_x, stride_y);
+    auto               out_dims   = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx),
+                                                                    weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info);
 
     const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
     // Output auto initialization if not yet initialized
@@ -157,16 +157,30 @@
         _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
 
         // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
-        unsigned int      padx            = 0;
-        unsigned int      pady            = 0;
-        const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*_permuted_input.info(), *_permuted_weights.info(), stride_x, stride_y, out_dims, padx,
-                                                                                  pady);
+        unsigned int      deconv_pad_x    = 0;
+        unsigned int      deconv_pad_y    = 0;
+        const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*_permuted_input.info(), *_permuted_weights.info(), stride_x, stride_y, out_dims,
+                                                                                  deconv_pad_x, deconv_pad_y);
+
+        unsigned int deconv_pad_left  = pad_right > pad_left ? pad_right - pad_left : 0;
+        unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
+        deconv_pad_x -= deconv_pad_left + deconv_pad_right;
+        ARM_COMPUTE_ERROR_ON((deconv_pad_x % 2) != 0);
+        deconv_pad_left += deconv_pad_x / 2;
+        deconv_pad_right += deconv_pad_x / 2;
+
+        unsigned int deconv_pad_top    = pad_bottom > pad_top ? pad_bottom - pad_top : 0;
+        unsigned int deconv_pad_bottom = pad_top > pad_bottom ? pad_top - pad_bottom : 0;
+        deconv_pad_y -= deconv_pad_top + deconv_pad_bottom;
+        ARM_COMPUTE_ERROR_ON((deconv_pad_y % 2) != 0);
+        deconv_pad_top += deconv_pad_y / 2;
+        deconv_pad_bottom += deconv_pad_y / 2;
 
         TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(), _permuted_input.info()->quantization_info());
         scale_out_info.set_data_layout(DataLayout::NCHW);
         _scaled_output.allocator()->init(scale_out_info);
 
-        const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
+        const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
         _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info);
 
         _weights_flipped.allocator()->init(*_permuted_weights.info()->clone());
@@ -189,14 +203,30 @@
     else
     {
         // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
-        unsigned int      padx            = 0;
-        unsigned int      pady            = 0;
-        const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, padx, pady);
+        unsigned int      deconv_pad_x    = 0;
+        unsigned int      deconv_pad_y    = 0;
+        const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y,
+                                                                                  out_dims, deconv_pad_x, deconv_pad_y);
+
+        unsigned int deconv_pad_left  = pad_right > pad_left ? pad_right - pad_left : 0;
+        unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
+        deconv_pad_x -= deconv_pad_left + deconv_pad_right;
+        ARM_COMPUTE_ERROR_ON((deconv_pad_x % 2) != 0);
+        deconv_pad_left += deconv_pad_x / 2;
+        deconv_pad_right += deconv_pad_x / 2;
+
+        unsigned int deconv_pad_top    = pad_bottom > pad_top ? pad_bottom - pad_top : 0;
+        unsigned int deconv_pad_bottom = pad_top > pad_bottom ? pad_top - pad_bottom : 0;
+        deconv_pad_y -= deconv_pad_top + deconv_pad_bottom;
+        ARM_COMPUTE_ERROR_ON((deconv_pad_y % 2) != 0);
+        deconv_pad_top += deconv_pad_y / 2;
+        deconv_pad_bottom += deconv_pad_y / 2;
 
         TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
         scale_out_info.set_data_layout(data_layout);
         _scaled_output.allocator()->init(scale_out_info);
-        const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
+
+        const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
         _upsample_f.configure(input, &_scaled_output, upsample_info);
 
         _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
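
Note: both configure() branches above replace the old symmetric padx/pady split with an asymmetric one: the imbalance between the user's before/after padding is assigned first, and the remaining (even) part of the upsample padding is shared equally between the two sides. The arithmetic is identical for x and y, so it can be restated in one small helper; this is an illustrative restatement of the diff's logic, not library code.

    #include <cassert>
    #include <utility>

    // Distribute the total extra padding reported by
    // compute_deconvolution_upsampled_shape() between the "before" (left/top) and
    // "after" (right/bottom) sides, honouring an asymmetric user padding first.
    std::pair<unsigned int, unsigned int> split_deconv_pad(unsigned int deconv_pad,
                                                           unsigned int user_pad_before,
                                                           unsigned int user_pad_after)
    {
        unsigned int before = user_pad_after > user_pad_before ? user_pad_after - user_pad_before : 0;
        unsigned int after  = user_pad_before > user_pad_after ? user_pad_before - user_pad_after : 0;
        deconv_pad -= before + after; // the remainder must be even, as the diff asserts
        assert(deconv_pad % 2 == 0);
        before += deconv_pad / 2;
        after += deconv_pad / 2;
        return { before, after };
    }

    // Example: deconv_pad_x = 4 with pad_left = 1, pad_right = 3 yields {3, 1};
    // with symmetric user padding it degenerates to the old padx / 2 split.
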
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index fbdee84..6cf7b97 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -23,548 +23,20 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
 
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "support/ToolchainSupport.h"
-
-#include "arm_compute/core/utils/misc/InfoHelpers.h"
 
 using namespace arm_compute::misc;
 using namespace arm_compute::misc::shape_calculator;
 
 namespace arm_compute
 {
-NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
-      _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
-      _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
+namespace
 {
-}
-
-void NEDepthwiseConvolutionLayer3x3::configure_generic(ITensor                   *input,
-                                                       const ITensor             *weights,
-                                                       const ITensor             *biases,
-                                                       ITensor                   *output,
-                                                       const PadStrideInfo       &conv_info,
-                                                       unsigned int               depth_multiplier,
-                                                       const ActivationLayerInfo &act_info,
-                                                       const Size2D              &dilation)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-
-    PixelValue zero_value(0.f);
-
-    // Initialize the intermediate accumulator tensor in case of quantized input
-    if(_is_quantized)
-    {
-        TensorShape accum_shape  = output->info()->tensor_shape();
-        DataLayout  accum_layout = output->info()->data_layout();
-        if(!_is_nchw)
-        {
-            permute(accum_shape, PermutationVector(1U, 2U, 0U));
-            accum_layout = DataLayout::NCHW;
-        }
-
-        _memory_group.manage(&_accumulator);
-        _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
-        _accumulator.info()->set_data_layout(accum_layout);
-        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset));
-    }
-
-    if(!_is_nchw)
-    {
-        _memory_group.manage(&_permuted_input);
-        _memory_group.manage(&_permuted_output);
-
-        // Configure the function to transform the input tensor from NHWC -> NCHW
-        _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
-        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
-
-        // Configure the function to transform the weights tensor from HWI -> IHW
-        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
-        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
-        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
-
-        // Configure depthwise
-        _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
-
-        // Configure border handler
-        _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-
-        // Allocate tensors
-        _permuted_input.allocator()->allocate();
-    }
-    else
-    {
-        // Configure depthwise convolution kernel
-        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
-
-        // Configure border handler
-        _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-    }
-
-    // Configure biases accumulation
-    if(_is_quantized)
-    {
-        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform();
-
-        float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
-        int   output_multiplier;
-        int   output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, oq_info.offset);
-        _accumulator.allocator()->allocate();
-    }
-    else if(_has_bias)
-    {
-        _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
-    }
-
-    // Permute output
-    if(!_is_nchw)
-    {
-        // Configure the function to transform the convoluted output to NHWC
-        _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
-        _permuted_output.allocator()->allocate();
-    }
-}
-
-void NEDepthwiseConvolutionLayer3x3::configure_optimized(const ITensor             *input,
-                                                         const ITensor             *weights,
-                                                         const ITensor             *biases,
-                                                         ITensor                   *output,
-                                                         const PadStrideInfo       &conv_info,
-                                                         unsigned int               depth_multiplier,
-                                                         const ActivationLayerInfo &act_info)
-{
-    ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
-    const bool          is_relu         = arm_compute::utils::info_helpers::is_relu(act_info);
-    const bool          is_relu6        = arm_compute::utils::info_helpers::is_relu6(act_info);
-    _is_activationlayer_enabled         = act_info.enabled() && !(is_relu || is_relu6);
-    if(!_is_activationlayer_enabled)
-    {
-        act_info_to_use = act_info;
-    }
-
-    if(_is_nchw)
-    {
-        _memory_group.manage(&_permuted_input);
-        _memory_group.manage(&_permuted_output);
-
-        // Configure the function to transform the input tensor from NCHW -> NHWC
-        _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
-        _permuted_input.info()->set_data_layout(DataLayout::NHWC);
-
-        // Configure the function to transform the weights tensor from IHW -> HWI
-        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
-        _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
-
-        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
-        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
-
-        // Configure optimized depthwise
-        _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use);
-
-        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
-        _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
-
-        // Allocate tensors
-        _permuted_input.allocator()->allocate();
-        _permuted_output.allocator()->allocate();
-    }
-    else
-    {
-        _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use);
-    }
-}
-
-void NEDepthwiseConvolutionLayer3x3::configure(ITensor       *input,
-                                               const ITensor *weights,
-                                               const ITensor *biases,
-                                               ITensor *output, const PadStrideInfo &conv_info,
-                                               unsigned int               depth_multiplier,
-                                               const ActivationLayerInfo &act_info,
-                                               const Size2D              &dilation)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer3x3::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
-                                                                        output->info(), conv_info, depth_multiplier, act_info, dilation));
-
-    _original_weights = weights;
-    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _has_bias         = biases != nullptr;
-    _is_optimized     = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
-                                                                                       weights->info(),
-                                                                                       conv_info,
-                                                                                       depth_multiplier, dilation);
-    _is_nchw                    = input->info()->data_layout() == DataLayout::NCHW;
-    _permute                    = _is_optimized == _is_nchw;
-    _is_prepared                = false;
-    _is_activationlayer_enabled = act_info.enabled();
-
-    // Configure appropriate pipeline
-    if(_is_optimized)
-    {
-        configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info);
-    }
-    else
-    {
-        configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-    }
-
-    // Configure activation
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
-    }
-}
-
-Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo         *input,
-                                                const ITensorInfo         *weights,
-                                                const ITensorInfo         *biases,
-                                                const ITensorInfo         *output,
-                                                const PadStrideInfo       &conv_info,
-                                                unsigned int               depth_multiplier,
-                                                const ActivationLayerInfo &act_info,
-                                                const Size2D              &dilation)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
-    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
-    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
-
-    if(biases != nullptr)
-    {
-        const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
-    }
-
-    if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation))
-    {
-        const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
-        TensorInfo accumulator  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
-
-        if(is_quantized)
-        {
-            const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
-            const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
-            const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
-
-            float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
-            int   output_multiplier;
-            int   output_shift;
-            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output, output_multiplier, output_shift, oq_info.offset));
-        }
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier));
-    }
-
-    //Validate Activation Layer
-    if(act_info.enabled())
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
-    }
-
-    return Status{};
-}
-
-void NEDepthwiseConvolutionLayer3x3::run_generic()
-{
-    // Fill border
-    NEScheduler::get().schedule(&_border_handler, Window::DimX);
-
-    // Execute depthwise convolution
-    NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
-
-    // Add biases
-    if(_has_bias || _is_quantized)
-    {
-        NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
-    }
-
-    // Permute output
-    if(!_is_nchw)
-    {
-        _permute_output.run();
-    }
-}
-
-void NEDepthwiseConvolutionLayer3x3::run_optimized()
-{
-    // Run assembly function
-    _dwc_optimized_func.run();
-
-    // Permute output
-    if(_is_nchw)
-    {
-        _permute_output.run();
-    }
-}
-
-void NEDepthwiseConvolutionLayer3x3::run()
-{
-    prepare();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Permute input
-    if(_permute)
-    {
-        _permute_input.run();
-    }
-
-    _is_optimized ? run_optimized() : run_generic();
-
-    // Run activation
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
-}
-
-void NEDepthwiseConvolutionLayer3x3::prepare()
-{
-    if(!_is_prepared)
-    {
-        // Permute weights
-        if(_permute)
-        {
-            _permuted_weights.allocator()->allocate();
-            _permute_weights.run();
-            _original_weights->mark_as_unused();
-        }
-
-        // Prepare optimized function
-        if(_is_optimized)
-        {
-            _dwc_optimized_func.prepare();
-            if(!_permuted_weights.is_used())
-            {
-                _permuted_weights.allocator()->free();
-            }
-        }
-
-        _is_prepared = true;
-    }
-}
-
-NEDepthwiseConvolutionLayerOptimized::NEDepthwiseConvolutionLayerOptimized(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
-      _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
-      _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
-{
-}
-
-void NEDepthwiseConvolutionLayerOptimized::configure_generic(ITensor                   *input,
-                                                             const ITensor             *weights,
-                                                             const ITensor             *biases,
-                                                             ITensor                   *output,
-                                                             const PadStrideInfo       &conv_info,
-                                                             unsigned int               depth_multiplier,
-                                                             const ActivationLayerInfo &act_info,
-                                                             const Size2D              &dilation)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-
-    PixelValue zero_value(0.f);
-
-    // Initialize the intermediate accumulator tensor in case of quantized input
-    if(_is_quantized)
-    {
-        TensorShape accum_shape  = output->info()->tensor_shape();
-        DataLayout  accum_layout = output->info()->data_layout();
-        if(!_is_nchw)
-        {
-            permute(accum_shape, PermutationVector(1U, 2U, 0U));
-            accum_layout = DataLayout::NCHW;
-        }
-
-        _memory_group.manage(&_accumulator);
-        _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
-        _accumulator.info()->set_data_layout(accum_layout);
-        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset));
-    }
-
-    if(!_is_nchw)
-    {
-        _memory_group.manage(&_permuted_input);
-        _memory_group.manage(&_permuted_output);
-
-        // Configure the function to transform the input tensor from NHWC -> NCHW
-        _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
-        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
-
-        // Configure the function to transform the weights tensor from HWI -> IHW
-        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
-        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
-        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
-
-        // Configure depthwise
-        _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
-
-        // Configure border handler
-        _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-
-        // Allocate tensors
-        _permuted_input.allocator()->allocate();
-    }
-    else
-    {
-        // Configure depthwise convolution kernel
-        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
-
-        // Configure border handler
-        _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-    }
-
-    // Configure biases accumulation
-    if(_is_quantized)
-    {
-        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform();
-
-        float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
-        int   output_multiplier;
-        int   output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, oq_info.offset);
-        _accumulator.allocator()->allocate();
-    }
-    else if(_has_bias)
-    {
-        _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
-    }
-
-    // Permute output
-    if(!_is_nchw)
-    {
-        // Configure the function to transform the convoluted output to NHWC
-        _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
-        _permuted_output.allocator()->allocate();
-    }
-}
-
-void NEDepthwiseConvolutionLayerOptimized::configure_optimized(const ITensor             *input,
-                                                               const ITensor             *weights,
-                                                               const ITensor             *biases,
-                                                               ITensor                   *output,
-                                                               const PadStrideInfo       &conv_info,
-                                                               unsigned int               depth_multiplier,
-                                                               const ActivationLayerInfo &act_info,
-                                                               const Size2D              &dilation)
-{
-    ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
-    const bool          is_relu         = arm_compute::utils::info_helpers::is_relu(act_info);
-    const bool          is_relu6        = arm_compute::utils::info_helpers::is_relu6(act_info);
-    _is_activationlayer_enabled         = act_info.enabled() && !(is_relu || is_relu6);
-    if(!_is_activationlayer_enabled)
-    {
-        act_info_to_use = act_info;
-    }
-
-    if(_is_nchw)
-    {
-        _memory_group.manage(&_permuted_input);
-        _memory_group.manage(&_permuted_output);
-
-        // Configure the function to transform the input tensor from NCHW -> NHWC
-        _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
-        _permuted_input.info()->set_data_layout(DataLayout::NHWC);
-
-        // Configure the function to transform the weights tensor from IHW -> HWI
-        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
-        _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
-
-        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
-        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
-
-        // Configure optimized depthwise
-        _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use, dilation);
-
-        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
-        _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
-
-        // Allocate tensors
-        _permuted_input.allocator()->allocate();
-        _permuted_output.allocator()->allocate();
-    }
-    else
-    {
-        _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use, dilation);
-    }
-}
-
-void NEDepthwiseConvolutionLayerOptimized::configure(ITensor       *input,
-                                                     const ITensor *weights,
-                                                     const ITensor *biases,
-                                                     ITensor *output, const PadStrideInfo &conv_info,
-                                                     unsigned int               depth_multiplier,
-                                                     const ActivationLayerInfo &act_info,
-                                                     const Size2D              &dilation)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayerOptimized::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
-                                                                              output->info(), conv_info, depth_multiplier, act_info, dilation));
-
-    _original_weights = weights;
-    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _has_bias         = biases != nullptr;
-    _is_optimized     = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
-                                                                                       weights->info(),
-                                                                                       conv_info,
-                                                                                       depth_multiplier,
-                                                                                       dilation);
-    _is_nchw                    = input->info()->data_layout() == DataLayout::NCHW;
-    _permute                    = _is_optimized == _is_nchw;
-    _is_prepared                = false;
-    _is_activationlayer_enabled = act_info.enabled();
-
-    // Configure appropriate pipeline
-    if(_is_optimized)
-    {
-        configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-    }
-    else
-    {
-        configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-    }
-
-    // Configure activation
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
-    }
-}
-
-Status NEDepthwiseConvolutionLayerOptimized::validate(const ITensorInfo         *input,
-                                                      const ITensorInfo         *weights,
-                                                      const ITensorInfo         *biases,
-                                                      const ITensorInfo         *output,
-                                                      const PadStrideInfo       &conv_info,
-                                                      unsigned int               depth_multiplier,
-                                                      const ActivationLayerInfo &act_info,
-                                                      const Size2D              &dilation)
+Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                    unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -619,8 +91,244 @@
 
     return Status{};
 }
+} // namespace
 
-void NEDepthwiseConvolutionLayerOptimized::run_generic()
+NEDepthwiseConvolutionLayerOptimized::NEDepthwiseConvolutionLayerOptimized(std::shared_ptr<IMemoryManager> memory_manager)
+    : _func(std::move(memory_manager))
+{
+}
+
+void NEDepthwiseConvolutionLayerOptimized::configure(ITensor       *input,
+                                                     const ITensor *weights,
+                                                     const ITensor *biases,
+                                                     ITensor *output, const PadStrideInfo &conv_info,
+                                                     unsigned int               depth_multiplier,
+                                                     const ActivationLayerInfo &act_info,
+                                                     const Size2D              &dilation)
+{
+    _func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+}
+
+Status NEDepthwiseConvolutionLayerOptimized::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                      unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+{
+    return validate_arguments_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+}
+
+void NEDepthwiseConvolutionLayerOptimized::run()
+{
+    _func.run();
+}
+
+void NEDepthwiseConvolutionLayerOptimized::prepare()
+{
+    _func.prepare();
+}
+
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
+      _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
+      _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
+{
+}
+
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure_generic(ITensor                   *input,
+                                                                                                  const ITensor             *weights,
+                                                                                                  const ITensor             *biases,
+                                                                                                  ITensor                   *output,
+                                                                                                  const PadStrideInfo       &conv_info,
+                                                                                                  unsigned int               depth_multiplier,
+                                                                                                  const ActivationLayerInfo &act_info,
+                                                                                                  const Size2D              &dilation)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+
+    PixelValue zero_value(0.f);
+
+    // Initialize the intermediate accumulator tensor in case of quantized input
+    if(_is_quantized)
+    {
+        TensorShape accum_shape  = output->info()->tensor_shape();
+        DataLayout  accum_layout = output->info()->data_layout();
+        if(!_is_nchw)
+        {
+            permute(accum_shape, PermutationVector(1U, 2U, 0U));
+            accum_layout = DataLayout::NCHW;
+        }
+
+        _memory_group.manage(&_accumulator);
+        _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
+        _accumulator.info()->set_data_layout(accum_layout);
+        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset));
+    }
+
+    if(!_is_nchw)
+    {
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
+
+        // Configure the function to transform the input tensor from NHWC -> NCHW
+        _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+        // Configure the function to transform the weights tensor from HWI -> IHW
+        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
+
+        // Configure depthwise
+        _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
+
+        // Configure border handler
+        _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+        // Allocate tensors
+        _permuted_input.allocator()->allocate();
+    }
+    else
+    {
+        // Configure depthwise convolution kernel
+        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
+
+        // Configure border handler
+        _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+    }
+
+    // Configure biases accumulation
+    if(_is_quantized)
+    {
+        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform();
+
+        float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
+        int   output_multiplier;
+        int   output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, oq_info.offset);
+        _accumulator.allocator()->allocate();
+    }
+    else if(_has_bias)
+    {
+        _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
+    }
+
+    // Permute output
+    if(!_is_nchw)
+    {
+        // Configure the function to transform the convoluted output to NHWC
+        _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+        _permuted_output.allocator()->allocate();
+    }
+}
+
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure_optimized(const ITensor             *input,
+                                                                                                    const ITensor             *weights,
+                                                                                                    const ITensor             *biases,
+                                                                                                    ITensor                   *output,
+                                                                                                    const PadStrideInfo       &conv_info,
+                                                                                                    unsigned int               depth_multiplier,
+                                                                                                    const ActivationLayerInfo &act_info,
+                                                                                                    const Size2D              &dilation)
+{
+    ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
+    const bool          is_relu         = arm_compute::utils::info_helpers::is_relu(act_info);
+    const bool          is_relu6        = arm_compute::utils::info_helpers::is_relu6(act_info);
+    _is_activationlayer_enabled         = act_info.enabled() && !(is_relu || is_relu6);
+    if(!_is_activationlayer_enabled)
+    {
+        act_info_to_use = act_info;
+    }
+
+    if(_is_nchw)
+    {
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
+
+        // Configure the function to transform the input tensor from NCHW -> NHWC
+        _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+        _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+
+        // Configure the function to transform the weights tensor from IHW -> HWI
+        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+
+        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
+
+        // Configure optimized depthwise
+        _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use, dilation);
+
+        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+        _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+
+        // Allocate tensors
+        _permuted_input.allocator()->allocate();
+        _permuted_output.allocator()->allocate();
+    }
+    else
+    {
+        _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use, dilation);
+    }
+}
+
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor       *input,
+                                                                                          const ITensor *weights,
+                                                                                          const ITensor *biases,
+                                                                                          ITensor *output, const PadStrideInfo &conv_info,
+                                                                                          unsigned int               depth_multiplier,
+                                                                                          const ActivationLayerInfo &act_info,
+                                                                                          const Size2D              &dilation)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
+                                                                                      output->info(), conv_info, depth_multiplier, act_info, dilation));
+
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _has_bias         = biases != nullptr;
+    _is_optimized     = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
+                                                                                       weights->info(),
+                                                                                       conv_info,
+                                                                                       depth_multiplier,
+                                                                                       dilation);
+    _is_nchw                    = input->info()->data_layout() == DataLayout::NCHW;
+    _permute                    = _is_optimized == _is_nchw;
+    _is_prepared                = false;
+    _is_activationlayer_enabled = act_info.enabled();
+
+    // Configure appropriate pipeline
+    if(_is_optimized)
+    {
+        configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+    }
+    else
+    {
+        configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+    }
+
+    // Configure activation
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
+}
+
+Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo         *input,
+                                                                                           const ITensorInfo         *weights,
+                                                                                           const ITensorInfo         *biases,
+                                                                                           const ITensorInfo         *output,
+                                                                                           const PadStrideInfo       &conv_info,
+                                                                                           unsigned int               depth_multiplier,
+                                                                                           const ActivationLayerInfo &act_info,
+                                                                                           const Size2D              &dilation)
+{
+    return validate_arguments_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+}
+
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run_generic()
 {
     // Fill border
     NEScheduler::get().schedule(&_border_handler, Window::DimX);
@@ -641,7 +349,7 @@
     }
 }
 
-void NEDepthwiseConvolutionLayerOptimized::run_optimized()
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run_optimized()
 {
     // Run assembly function
     _dwc_optimized_func.run();
@@ -653,7 +361,7 @@
     }
 }
 
-void NEDepthwiseConvolutionLayerOptimized::run()
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run()
 {
     prepare();
 
@@ -674,7 +382,7 @@
     }
 }
 
-void NEDepthwiseConvolutionLayerOptimized::prepare()
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare()
 {
     if(!_is_prepared)
     {
@@ -700,262 +408,84 @@
     }
 }
 
-NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
-    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _depthwise_conv_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _fill_border(), _v2mm_input_fill_border(),
-      _v2mm_weights_fill_border(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(),
-      _permuted_input(), _permuted_weights(), _permuted_output(), _is_prepared(false), _is_quantized(false), _is_nhwc(false), _is_activationlayer_enabled(false), _is_optimized(false),
-      _original_weights(nullptr)
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
+    : _depthwise_conv_kernel(), _fill_border(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _permuted_input(), _permuted_weights(), _permuted_output(),
+      _is_prepared(false), _is_nchw(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
 {
 }
 
-void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+                                                                                unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
                                                                      output->info(), conv_info, depth_multiplier, act_info, dilation));
 
-    _is_nhwc      = input->info()->data_layout() == DataLayout::NHWC;
-    _is_optimized = _is_nhwc && input->info()->data_type() == DataType::F32;
+    _is_nchw     = input->info()->data_layout() == DataLayout::NCHW;
+    _is_prepared = !_is_nchw;
 
-    if(!_is_optimized)
+    ITensor       *input_to_use   = input;
+    const ITensor *weights_to_use = weights;
+    ITensor       *output_to_use  = output;
+    if(_is_nchw)
     {
-        ITensor       *input_to_use   = input;
-        const ITensor *weights_to_use = weights;
-        ITensor       *output_to_use  = output;
+        _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+        _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+        input_to_use = &_permuted_input;
 
-        if(_is_nhwc)
-        {
-            _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
-            _permuted_input.info()->set_data_layout(DataLayout::NCHW);
-            input_to_use = &_permuted_input;
+        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+        weights_to_use = &_permuted_weights;
 
-            _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
-            _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
-            weights_to_use = &_permuted_weights;
-        }
-
-        const size_t weights_w = weights_to_use->info()->dimension(0);
-        const size_t weights_h = weights_to_use->info()->dimension(1);
-        const size_t weights_z = weights_to_use->info()->dimension(2);
-
-        _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
-        _is_prepared      = false;
-        _original_weights = weights_to_use;
-
-        // Should bias be appended ?
-        bool append_bias = (biases != nullptr) && !_is_quantized;
-
-        // Calculate output shape
-        TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
-
-        // Output auto inizialitation if not yet initialized
-        auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
-        if(_is_nhwc)
-        {
-            permute(output_shape, PermutationVector(1U, 2U, 0U));
-            _permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
-            _permuted_output.info()->set_data_layout(DataLayout::NCHW);
-            _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
-            output_to_use = &_permuted_output;
-        }
-
-        // Output width and height
-        const unsigned int conv_w = output_shape.x();
-        const unsigned int conv_h = output_shape.y();
-
-        // Set up intermediate tensors
-        const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
-        const size_t conv_size  = conv_w * conv_h;
-
-        // Im2Col configuration
-        TensorShape shape_im2col = input_to_use->info()->tensor_shape();
-        shape_im2col.set(0, patch_size);
-        shape_im2col.set(1, conv_size);
-        shape_im2col.set(2, weights_z);
-        _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
-        _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);
-
-        // Weights reshape configuration
-        const TensorShape shape_weights_reshape(patch_size, weights_z);
-        _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape).set_data_layout(DataLayout::NCHW));
-        _weights_reshape_kernel.configure(weights_to_use, &_weights_reshaped, append_bias ? biases : nullptr);
-
-        // GEMV configuration
-        DataType    v2mm_dt        = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
-        TensorShape shape_v2mm_out = input_to_use->info()->tensor_shape();
-        shape_v2mm_out.set(0, conv_size * weights_z);
-        shape_v2mm_out.set(1, 1);
-        shape_v2mm_out.set(2, 1);
-        _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out).set_data_layout(DataLayout::NCHW));
-        _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
-        _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
-        _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output_to_use, conv_w, conv_h);
-
-        // Output staged configuration
-        if(_is_quantized)
-        {
-            const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-            const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
-            const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
-            float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
-            int   output_multiplier;
-            int   output_shift;
-            quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-            _output_stage_kernel.configure(&_output_reshaped, biases, output_to_use, output_multiplier, output_shift, oq_info.offset);
-            _output_reshaped.allocator()->allocate();
-        }
-
-        if(_is_nhwc)
-        {
-            _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
-
-            _permuted_input.allocator()->allocate();
-            _permuted_weights.allocator()->allocate();
-            _permuted_output.allocator()->allocate();
-        }
-
-        // Fill borders on inputs
-        PixelValue zero_in(static_cast<int32_t>(0));
-        PixelValue zero_w(static_cast<int32_t>(0));
-        if(_is_quantized)
-        {
-            zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().uniform().offset));
-            zero_w  = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().uniform().offset));
-        }
-        BorderSize border_size = _v2mm_kernel.border_size();
-        _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
-
-        border_size.bottom = 0;
-        _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
-
-        // Allocate intermediate tensors
-        _input_reshaped.allocator()->allocate();
-        _v2mm_output.allocator()->allocate();
+        _permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+        output_to_use = &_permuted_output;
     }
-    else
-    {
-        // Configure kernel
-        _depthwise_conv_kernel.configure(input, weights, biases, output, conv_info, depth_multiplier, dilation);
+    _original_weights = weights_to_use;
 
-        // Fill input borders
-        _fill_border.configure(input, _depthwise_conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint64_t>(0), input->info()->data_type()));
+    _depthwise_conv_kernel.configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, dilation);
+    _fill_border.configure(input_to_use, _depthwise_conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()));
+
+    if(_is_nchw)
+    {
+        _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+
+        _permuted_input.allocator()->allocate();
+        _permuted_weights.allocator()->allocate();
+        _permuted_output.allocator()->allocate();
     }
 
     // Configure activation layer
     _is_activationlayer_enabled = act_info.enabled();
-
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.configure(output, nullptr, act_info);
     }
 }
 
-Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+                                                                                 const PadStrideInfo &conv_info,
+                                                                                 unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
-    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
-
-    const unsigned int width_idx   = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
-    const unsigned int height_idx  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-    const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) + (weights->dimension(width_idx) - 1) * (dilation.x() - 1) > input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right());
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) + (weights->dimension(height_idx) - 1) * (dilation.y() - 1) > input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom());
-    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) * depth_multiplier) != weights->dimension(channel_idx));
-
-    if(input->data_layout() != DataLayout::NHWC || input->data_type() != DataType::F32)
+    if(input->data_layout() == DataLayout::NCHW)
     {
-        // Clone output to use auto init
-        auto output_clone = output->clone();
-
-        const ITensorInfo *input_to_use   = input;
-        const ITensorInfo *weights_to_use = weights;
-        const ITensorInfo *output_to_use  = output_clone.get();
-
         TensorShape permuted_input_shape   = input->tensor_shape();
         TensorShape permuted_weights_shape = weights->tensor_shape();
-        TensorInfo  permuted_input;
-        TensorInfo  permuted_weights;
+        TensorShape permuted_output_shape  = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+        permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
+        permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
+        permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
 
-        if(input->data_layout() == DataLayout::NHWC)
-        {
-            permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
-            permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
+        const TensorInfo permuted_input   = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
+        const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
+        const TensorInfo permuted_output  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
 
-            permuted_input   = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW));
-            permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
 
-            input_to_use   = &permuted_input;
-            weights_to_use = &permuted_weights;
-        }
-
-        const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
-        const bool         append_bias  = (biases != nullptr) && !is_quantized;
-        TensorShape        output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
-        const size_t       weights_w    = weights_to_use->dimension(0);
-        const size_t       weights_h    = weights_to_use->dimension(1);
-        const size_t       weights_z    = weights_to_use->dimension(2);
-        const unsigned int conv_w       = output_shape[width_idx];
-        const unsigned int conv_h       = output_shape[height_idx];
-        const size_t       patch_size   = weights_w * weights_h + (append_bias ? 1 : 0);
-        const size_t       conv_size    = conv_w * conv_h;
-
-        // Output auto inizialitation if not yet initialized
-        auto_init_if_empty(*output_clone, input->clone()->set_tensor_shape(output_shape));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
-
-        TensorInfo permuted_output;
-        if(input->data_layout() == DataLayout::NHWC)
-        {
-            permute(output_shape, PermutationVector(1U, 2U, 0U));
-            permuted_output = TensorInfo(output_clone->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_layout(DataLayout::NCHW));
-            output_to_use   = &permuted_output;
-        }
-
-        // Im2Col configuration
-        TensorShape shape_im2col = input_to_use->tensor_shape();
-        shape_im2col.set(0, patch_size);
-        shape_im2col.set(1, conv_size);
-        shape_im2col.set(2, weights_z);
-        TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));
-
-        // Weights reshape configuration
-        const TensorShape shape_weights_reshape(patch_size, weights_z);
-        TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape).set_data_layout(DataLayout::NCHW));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseWeightsReshapeKernel::validate(weights_to_use, &weights_reshaped, append_bias ? biases : nullptr));
-
-        // GEMV configuration
-        DataType    v2mm_dt        = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
-        TensorShape shape_v2mm_out = input_to_use->tensor_shape();
-        shape_v2mm_out.set(0, conv_size * weights_z);
-        shape_v2mm_out.set(1, 1);
-        shape_v2mm_out.set(2, 1);
-        TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out).set_data_layout(DataLayout::NCHW));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
-
-        TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_to_use->tensor_shape()));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output_to_use, conv_w, conv_h));
-
-        if(is_quantized)
-        {
-            const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
-            const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
-            const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
-
-            float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
-            int   output_multiplier;
-            int   output_shift;
-            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output_to_use, output_multiplier, output_shift, oq_info.offset));
-        }
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, dilation));
     }
     else
     {
@@ -971,35 +501,20 @@
     return Status{};
 }
 
-void NEDepthwiseConvolutionLayer::run()
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run()
 {
-    if(!_is_optimized)
+    if(_is_nchw)
     {
         prepare();
-
-        if(_is_nhwc)
-        {
-            _permute_input.run();
-        }
-
-        NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
-        NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
-        NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
-        NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
-        if(_is_quantized)
-        {
-            NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
-        }
-
-        if(_is_nhwc)
-        {
-            _permute_output.run();
-        }
+        _permute_input.run();
     }
-    else
+
+    NEScheduler::get().schedule(&_fill_border, Window::DimX);
+    NEScheduler::get().schedule(&_depthwise_conv_kernel, Window::DimY);
+
+    if(_is_nchw)
     {
-        NEScheduler::get().schedule(&_fill_border, Window::DimX);
-        NEScheduler::get().schedule(&_depthwise_conv_kernel, Window::DimY);
+        _permute_output.run();
     }
 
     if(_is_activationlayer_enabled)
@@ -1008,24 +523,98 @@
     }
 }
 
-void NEDepthwiseConvolutionLayer::prepare()
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::prepare()
 {
-    if(!_is_prepared && !_is_optimized)
+    if(!_is_prepared)
     {
         ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
-        if(_is_nhwc)
-        {
-            _permute_weights.run();
-        }
-
-        // Run reshape and mark original weights as unused
-        _weights_reshaped.allocator()->allocate();
-        NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
-        NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
+        _permute_weights.run();
         _original_weights->mark_as_unused();
-
         _is_prepared = true;
     }
 }
+
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(std::move(memory_manager)), _func_generic()
+{
+}
+
+void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+                                            const ActivationLayerInfo &act_info, const Size2D &dilation)
+{
+    _depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation);
+    switch(_depth_conv_func)
+    {
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            _func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
+            _func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+    }
+}
+
+Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+{
+    DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+    switch(depth_conv_func)
+    {
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            return NEDepthwiseConvolutionLayerOptimized::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
+            return NEDepthwiseConvolutionLayerGeneric::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+    }
+}
+
+DepthwiseConvolutionFunction NEDepthwiseConvolutionLayer::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+                                                                                            const PadStrideInfo &conv_info,
+                                                                                            unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+{
+    if(bool(NEDepthwiseConvolutionLayerOptimized::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)))
+    {
+        return DepthwiseConvolutionFunction::OPTIMIZED;
+    }
+    else
+    {
+        return DepthwiseConvolutionFunction::GENERIC;
+    }
+}
+
+void NEDepthwiseConvolutionLayer::run()
+{
+    switch(_depth_conv_func)
+    {
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            _func_optimized.run();
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
+            _func_generic.run();
+            break;
+        default:
+            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
+    }
+}
+
+void NEDepthwiseConvolutionLayer::prepare()
+{
+    switch(_depth_conv_func)
+    {
+        case DepthwiseConvolutionFunction::OPTIMIZED:
+            _func_optimized.prepare();
+            break;
+        case DepthwiseConvolutionFunction::GENERIC:
+            _func_generic.prepare();
+            break;
+        default:
+            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
+    }
+}
 } // namespace arm_compute
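
The public NEDepthwiseConvolutionLayer is now a thin dispatcher: get_depthwiseconvolution_function() probes the optimized (assembly) path via its validate() and falls back to the generic native kernel otherwise. A minimal caller-side sketch of the resulting API, assuming F32 tensors in the default NCHW layout, illustrative shapes, and a defaulted memory-manager argument:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void depthwise_example()
{
    Tensor src, weights, bias, dst;
    // Illustrative shapes: 28x28 input with 32 channels, 3x3 depthwise kernel, depth multiplier 1.
    src.allocator()->init(TensorInfo(TensorShape(28U, 28U, 32U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 32U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(28U, 28U, 32U), 1, DataType::F32));

    NEDepthwiseConvolutionLayer depthwise;
    depthwise.configure(&src, &weights, &bias, &dst,
                        PadStrideInfo(1, 1, 1, 1) /* stride 1, pad 1 */,
                        1 /* depth_multiplier */,
                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
                        Size2D(1U, 1U) /* dilation */);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src/weights/bias ...

    depthwise.run(); // forwards to the OPTIMIZED or GENERIC internal function selected above
}
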
diff --git a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
deleted file mode 100644
index da2e49c..0000000
--- a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-NEDepthwiseSeparableConvolutionLayer::NEDepthwiseSeparableConvolutionLayer()
-    : _depthwise_conv(), _pointwise_conv()
-{
-}
-
-void NEDepthwiseSeparableConvolutionLayer::configure(ITensor *input, const ITensor *depthwise_weights, const ITensor *depthwise_biases, ITensor *depthwise_out,
-                                                     const ITensor *pointwise_weights, const ITensor *pointwise_biases, ITensor *output,
-                                                     const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
-{
-    _depthwise_conv.configure(input, depthwise_weights, depthwise_biases, depthwise_out, depthwise_conv_info);
-    _pointwise_conv.configure(depthwise_out, pointwise_weights, pointwise_biases, output, pointwise_conv_info);
-}
-
-void NEDepthwiseSeparableConvolutionLayer::run()
-{
-    prepare();
-
-    _depthwise_conv.run();
-    _pointwise_conv.run();
-}
-
-void NEDepthwiseSeparableConvolutionLayer::prepare()
-{
-    _depthwise_conv.prepare();
-    _pointwise_conv.prepare();
-}
\ No newline at end of file
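
With NEDepthwiseSeparableConvolutionLayer gone, the same depthwise-separable pattern can be composed from the remaining public functions. A hedged sketch (tensor initialisation and allocation omitted; NEConvolutionLayer with a 1x1 kernel plays the pointwise stage):

#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void depthwise_separable_example()
{
    Tensor src, dw_weights, dw_bias, dw_out, pw_weights, pw_bias, dst;
    // ... init and allocate the tensors as in the previous sketch ...

    const PadStrideInfo depthwise_conv_info(1, 1, 1, 1); // e.g. 3x3 depthwise, stride 1, pad 1
    const PadStrideInfo pointwise_conv_info(1, 1, 0, 0); // 1x1 pointwise, stride 1, no pad

    NEDepthwiseConvolutionLayer depthwise;
    NEConvolutionLayer          pointwise;

    depthwise.configure(&src, &dw_weights, &dw_bias, &dw_out, depthwise_conv_info);
    pointwise.configure(&dw_out, &pw_weights, &pw_bias, &dst, pointwise_conv_info);

    depthwise.run(); // per-channel spatial filtering
    pointwise.run(); // 1x1 convolution mixing the channels
}
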
diff --git a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
new file mode 100644
index 0000000..d1d1343
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <ios>
+#include <list>
+
+namespace arm_compute
+{
+NEDetectionPostProcessLayer::NEDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _dequantize(), _detection_post_process(), _decoded_scores(), _run_dequantize(false)
+{
+}
+
+void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors,
+                                            ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores);
+    ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(),
+                                                                     output_scores->info(),
+                                                                     num_detection->info(), info));
+
+    const ITensor                *input_scores_to_use = input_scores;
+    DetectionPostProcessLayerInfo info_to_use         = info;
+    _run_dequantize                                   = is_data_type_quantized(input_box_encoding->info()->data_type());
+
+    if(_run_dequantize)
+    {
+        _memory_group.manage(&_decoded_scores);
+
+        _dequantize.configure(input_scores, &_decoded_scores);
+
+        input_scores_to_use = &_decoded_scores;
+
+        // Create a new info struct to avoid dequantizing in the CPP layer
+        std::array<float, 4> scales_values{ info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), info.scale_value_w() };
+        DetectionPostProcessLayerInfo info_quantized(info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), info.num_classes(),
+                                                     scales_values, info.use_regular_nms(), info.detection_per_class(), false);
+        info_to_use = info_quantized;
+    }
+
+    _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, output_classes, output_scores, num_detection, info_to_use);
+    _decoded_scores.allocator()->allocate();
+}
+
+Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_scores, const ITensorInfo *input_anchors,
+                                             ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info)
+{
+    bool run_dequantize = is_data_type_quantized(input_box_encoding->data_type());
+    if(run_dequantize)
+    {
+        TensorInfo decoded_classes_info = input_scores->clone()->set_is_resizable(true).set_data_type(DataType::F32);
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(input_scores, &decoded_classes_info));
+    }
+    ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info));
+
+    return Status{};
+}
+
+void NEDetectionPostProcessLayer::run()
+{
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Decode scores if necessary
+    if(_run_dequantize)
+    {
+        _dequantize.run();
+    }
+    _detection_post_process.run();
+}
+} // namespace arm_compute
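
NEDetectionPostProcessLayer wraps CPPDetectionPostProcessLayer and, when the inputs are quantized, dequantizes the scores through NEDequantizationLayer first. A configuration sketch; the DetectionPostProcessLayerInfo values are illustrative and the tensor setup is assumed to happen elsewhere:

#include "arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h"
#include "arm_compute/runtime/Tensor.h"

#include <array>

using namespace arm_compute;

void detection_post_process_example()
{
    Tensor box_encodings, scores, anchors;                     // inputs (e.g. QASYMM8)
    Tensor out_boxes, out_classes, out_scores, num_detections; // outputs
    // ... init and allocate the tensors ...

    // Illustrative SSD-style parameters, in the argument order used above.
    const std::array<float, 4> scale_values{ 10.f, 10.f, 5.f, 5.f }; // y, x, h, w
    const DetectionPostProcessLayerInfo info(10 /* max_detections */, 1 /* max_classes_per_detection */,
                                             0.5f /* nms_score_threshold */, 0.6f /* iou_threshold */,
                                             90 /* num_classes */, scale_values,
                                             false /* use_regular_nms */, 100 /* detection_per_class */,
                                             true /* dequantize_scores */);

    NEDetectionPostProcessLayer post_process;
    post_process.configure(&box_encodings, &scores, &anchors,
                           &out_boxes, &out_classes, &out_scores, &num_detections, info);
    post_process.run(); // runs the dequantization first when the inputs are quantized
}
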
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 12a5a1d..ee622f4 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -74,10 +74,11 @@
     return NETransposeKernel::validate(input, output);
 }
 
-NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), _accumulate_biases_kernel(),
-      _flatten_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false),
-      _is_fc_after_conv(false), _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
+NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+    : _memory_group(std::move(memory_manager)), _weights_manager(weights_manager), _flatten_kernel(), _convert_weights(), _convert_weights_managed(), _reshape_weights_function(),
+      _reshape_weights_managed_function(), _mm_gemm(nullptr, weights_manager), _mm_gemmlowp(), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(),
+      _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), _accumulate_biases(false),
+      _is_quantized(false), _is_prepared(false)
 {
 }
 
@@ -155,6 +156,11 @@
     _is_quantized          = is_data_type_quantized_asymmetric(input->info()->data_type());
     _original_weights      = weights;
 
+    if(_weights_manager)
+    {
+        _weights_manager->manage(weights);
+    }
+
     // Configure gemmlowp output
     if(_is_quantized)
     {
@@ -194,21 +200,39 @@
     // Reshape weights if needed
     if(!_are_weights_reshaped)
     {
-        // Reshape the weights
-        _reshape_weights_function.configure(weights, &_reshape_weights_output);
-        weights_to_use = &_reshape_weights_output;
+        if(_weights_manager && _weights_manager->are_weights_managed(weights))
+        {
+            _reshape_weights_managed_function.configure(weights);
+            weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed_function);
+        }
+        else
+        {
+            // Reshape the weights
+            _reshape_weights_function.configure(weights, &_reshape_weights_output);
+            weights_to_use = &_reshape_weights_output;
+        }
     }
 
     // Convert weights if needed
     if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
     {
-        // Convert weights
-        _convert_weights.configure(weights_to_use,
-                                   &_converted_weights_output,
-                                   input->info()->tensor_shape(),
-                                   fc_info.weights_trained_layout);
+        if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use))
+        {
+            _convert_weights_managed.configure(weights_to_use,
+                                               input->info()->tensor_shape(),
+                                               fc_info.weights_trained_layout);
+            weights_to_use = _weights_manager->acquire(weights, &_convert_weights_managed);
+        }
+        else
+        {
+            // Convert weights
+            _convert_weights.configure(weights_to_use,
+                                       &_converted_weights_output,
+                                       input->info()->tensor_shape(),
+                                       fc_info.weights_trained_layout);
 
-        weights_to_use         = &_converted_weights_output;
+            weights_to_use = &_converted_weights_output;
+        }
         _are_weights_converted = false;
     }
 
@@ -381,7 +405,10 @@
 {
     if(!_is_prepared)
     {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        if(!_weights_manager)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        }
 
         auto release_unused = [](Tensor * w)
         {
@@ -397,22 +424,39 @@
         // Reshape of the weights (happens only once)
         if(!_are_weights_reshaped)
         {
-            // Run reshape weights kernel and mark weights as unused
-            _reshape_weights_output.allocator()->allocate();
-            _reshape_weights_function.run();
-
-            cur_weights->mark_as_unused();
-            cur_weights           = &_reshape_weights_output;
+            if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+            {
+                cur_weights = _weights_manager->run(cur_weights, &_reshape_weights_managed_function);
+            }
+            else
+            {
+                // Reshape of the weights (happens only once)
+                if(!_are_weights_reshaped)
+                {
+                    // Run reshape weights kernel and mark weights as unused
+                    _reshape_weights_output.allocator()->allocate();
+                    _reshape_weights_function.run();
+                }
+                cur_weights->mark_as_unused();
+                cur_weights = &_reshape_weights_output;
+            }
             _are_weights_reshaped = true;
         }
 
         // Convert weights if needed (happens only once)
         if(!_are_weights_converted)
         {
-            _converted_weights_output.allocator()->allocate();
-            _convert_weights.run();
+            if(_weights_manager && _weights_manager->are_weights_managed(cur_weights))
+            {
+                _weights_manager->run(cur_weights, &_convert_weights_managed);
+            }
+            else
+            {
+                _converted_weights_output.allocator()->allocate();
+                _convert_weights.run();
+                cur_weights->mark_as_unused();
+            }
 
-            cur_weights->mark_as_unused();
             _are_weights_converted = true;
         }
 
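
The new IWeightsManager parameter lets the reshape/convert weight transformations be tracked by a manager shared across functions rather than owned by each NEFullyConnectedLayer. A sketch of the wiring, assuming IWeightsManager can be instantiated directly as the concrete manager and that fc_info keeps its defaults:

#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/Tensor.h"

#include <memory>

using namespace arm_compute;

void fully_connected_example()
{
    auto weights_manager = std::make_shared<IWeightsManager>();

    Tensor src, weights, bias, dst;
    // ... init and allocate the tensors ...

    // No memory manager here; the weights manager takes care of the reshaped/converted weights.
    NEFullyConnectedLayer fc(nullptr, weights_manager.get());
    fc.configure(&src, &weights, &bias, &dst);

    fc.prepare(); // weight transforms run through the manager, as in prepare() above
    fc.run();
}
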
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 37d0e09..baa22b7 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -34,7 +34,6 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "arm_compute/runtime/TensorAllocator.h"
-#include "support/ToolchainSupport.h"
 
 #include <cmath>
 
@@ -42,9 +41,10 @@
 
 namespace arm_compute
 {
-NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr),
-      _run_vector_matrix_multiplication(false), _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
+NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+    : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager, weights_manager), _ma_kernel(),
+      _alpha_scale_func(nullptr), _add_bias_kernel(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false),
+      _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
 
@@ -52,34 +52,55 @@
 {
     ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
 
+    const bool is_c_bias     = gemm_info.reshape_b_only_on_first_run();
+    bool       run_optimised = bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), gemm_info));
+
     // Check if we need to reshape the matrix B only on the first run
     _is_prepared                      = false;
     _reshape_b_only_on_first_run      = gemm_info.reshape_b_only_on_first_run();
     _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
     _original_b                       = b;
-
-    bool run_optimised = c == nullptr && bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
+    _run_alpha_scale                  = alpha != 1.f;
+    _run_bias_addition                = c != nullptr && gemm_info.reshape_b_only_on_first_run();
+    _run_addition                     = beta != 0 && c != nullptr && !gemm_info.reshape_b_only_on_first_run();
+    _run_activation                   = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !NEGEMMAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
 
     if(run_optimised)
     {
+        const ITensor *c_to_use = is_c_bias ? c : nullptr;
         if(MEMInfo::get_policy() == MemoryPolicy::MINIMIZE)
         {
             GEMMInfo gemm_info_ntb = gemm_info;
             gemm_info_ntb.set_pretranpose_B(false);
-            _asm_glue.configure(a, b, c, d, alpha, beta, gemm_info_ntb);
+            _asm_glue.configure(a, b, c_to_use, d, gemm_info_ntb);
         }
         else
         {
-            _asm_glue.configure(a, b, c, d, alpha, beta, gemm_info);
+            _asm_glue.configure(a, b, c_to_use, d, gemm_info);
         }
         ARM_COMPUTE_ERROR_ON(!_asm_glue.is_configured());
+
+        // Scale product by alpha
+        if(_run_alpha_scale)
+        {
+            _alpha_scale_func.configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
+        }
     }
     else
     {
+        // Pick output tensor in case bias addition should be performed
+        ITensor *gemm_output_to_use = d;
+        if(_run_bias_addition)
+        {
+            gemm_output_to_use = &_tmp_d;
+            _memory_group.manage(&_tmp_d);
+        }
+
+        // Select between GEMV and GEMM
         if(_run_vector_matrix_multiplication)
         {
             // Configure the matrix multiply kernel
-            _mm_kernel.configure(a, b, d, alpha, false);
+            _mm_kernel.configure(a, b, gemm_output_to_use, alpha, false);
         }
         else
         {
@@ -117,7 +138,7 @@
             _transpose_kernel.configure(b, &_tmp_b);
 
             // Configure matrix multiplication kernel
-            _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha, true, GEMMReshapeInfo(m, n, k));
+            _mm_kernel.configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
 
             // Allocate once the all configure methods have been called
             _tmp_a.allocator()->allocate();
@@ -127,18 +148,31 @@
             }
         }
 
-        // Configure matrix addition kernel
-        if(beta != 0 && c != nullptr)
+        if(_run_bias_addition)
         {
-            _ma_kernel.configure(c, d, beta);
-            _run_addition = true;
+            _add_bias_kernel.configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
+            _tmp_d.allocator()->allocate();
         }
     }
+
+    // Configure matrix addition kernel
+    if(_run_addition)
+    {
+        _ma_kernel.configure(c, d, beta);
+    }
+
+    // Configure activation
+    const ActivationLayerInfo &activation = gemm_info.activation_info();
+    if(_run_activation)
+    {
+        _activation_func.configure(d, nullptr, activation);
+    }
 }
 
 Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
+    const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
 
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F16, DataType::F32);
@@ -147,7 +181,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
 
-    if(c != nullptr)
+    if(c != nullptr && !is_c_bias)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
         ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
@@ -178,7 +212,7 @@
     }
 
     // Check if we need to run the optimized assembly kernel
-    const bool run_optimised = c == nullptr && bool(NEGEMMAssemblyDispatch::validate(a, b, c, output, alpha, beta, gemm_info));
+    const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, gemm_info));
 
     if(!run_optimised)
     {
@@ -225,14 +259,26 @@
         // Validate matrix multiply
         auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
+
+        if(c != nullptr && gemm_info.reshape_b_only_on_first_run())
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&tmp_output_info, c, output, ConvertPolicy::SATURATE));
+        }
     }
 
     // Validate matrix addition kernel
-    if(beta != 0 && c != nullptr)
+    if(beta != 0 && c != nullptr && !is_c_bias)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAdditionKernel::validate(c, output, beta));
     }
 
+    // Validate activation
+    const ActivationLayerInfo &activation = gemm_info.activation_info();
+    if(activation.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
+    }
+
     return Status{};
 }
 
@@ -245,6 +291,10 @@
     if(_asm_glue.is_configured())
     {
         _asm_glue.run();
+        if(_run_alpha_scale)
+        {
+            _alpha_scale_func.run();
+        }
     }
     else
     {
@@ -262,12 +312,24 @@
 
         NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
 
-        // Run matrix addition kernel
-        if(_run_addition)
+        // Run bias addition kernel
+        if(_run_bias_addition)
         {
-            NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+            NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY);
         }
     }
+
+    // Run matrix addition kernel
+    if(_run_addition)
+    {
+        NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+    }
+
+    // Run activation function
+    if(_run_activation)
+    {
+        _activation_func.run();
+    }
 }
 
 void NEGEMM::prepare()
@@ -276,13 +338,19 @@
     {
         if(_asm_glue.is_configured())
         {
-            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+            if(!_weights_manager || !_weights_manager->are_weights_managed(_original_b))
+            {
+                ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+            }
 
             _asm_glue.prepare();
         }
         else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue.is_configured())
         {
-            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+            if(!_weights_manager || !_weights_manager->are_weights_managed(_original_b))
+            {
+                ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+            }
 
             _tmp_b.allocator()->allocate();
             NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
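
NEGEMM now hands C to the assembly kernel as a bias (or to the new addition kernel) when B is reshaped only on the first run, applies the alpha scaling through a LINEAR activation after the assembly path, and can run an activation taken from GEMMInfo. Basic usage is unchanged; a sketch with illustrative 2D shapes:

#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void gemm_example()
{
    Tensor a, b, c, d;
    a.allocator()->init(TensorInfo(TensorShape(64U, 16U), 1, DataType::F32)); // A: [K, M] = [64, 16]
    b.allocator()->init(TensorInfo(TensorShape(32U, 64U), 1, DataType::F32)); // B: [N, K] = [32, 64]
    c.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32)); // C: [N, M] = [32, 16]
    d.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32)); // D: [N, M] = [32, 16]

    NEGEMM gemm; // memory manager and weights manager left at their defaults
    gemm.configure(&a, &b, &c, &d, 1.0f /* alpha */, 1.0f /* beta */);

    a.allocator()->allocate();
    b.allocator()->allocate();
    c.allocator()->allocate();
    d.allocator()->allocate();
    // ... fill a/b/c ...

    gemm.run(); // C is consumed as a bias or via the matrix-addition kernel, per the GEMMInfo flags
}

When the GEMMInfo passed to configure() carries an enabled ActivationLayerInfo, the activation is either fused into the assembly kernel or executed by the internal NEActivationLayer, as the hunks above show.
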
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 2a4498b..24254eb 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -24,10 +24,8 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 
 #include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
-#include "arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h"
 
 #include <arm_neon.h>
 
@@ -35,63 +33,127 @@
 {
 namespace
 {
-std::unique_ptr<IFunction> create_function_all_types(const arm_gemm::KernelDescription &gemm_kernel_info,
-                                                     const ITensor *a, const ITensor *b, ITensor *d,
-                                                     float alpha, float beta, const GEMMInfo &gemm_info,
-                                                     std::shared_ptr<IMemoryManager> memory_manager)
-
+arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
 {
-    // Note: It's safe to not check for FP16 support because this was already checked in NEGEMMAssemblyDispatch::configure()
-    switch(gemm_kernel_info.method)
+    arm_gemm::Activation gemm_act;
+
+    // Early exit in case lower bound is other than 0, as it's not yet supported
+    if(act.b() != 0.f)
     {
-        case arm_gemm::GemmMethod::GEMM_INTERLEAVED:
-        {
-            if(!gemm_info.pretranpose_B())
-            {
-                return nullptr;
-            }
-            auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
-            function->configure(a, b, d, alpha, beta, gemm_info);
-            return std::move(function);
-        }
-#if defined(__aarch64__)
-        case arm_gemm::GemmMethod::GEMM_NATIVE:
-        {
-            if(gemm_kernel_info.name.find("sgemm_native_16x4") != std::string::npos)
-            {
-                auto kernel = support::cpp14::make_unique<NEGEMMNativeWrapperKernel<float, float>>();
-                kernel->configure(a, b, d, alpha, beta, gemm_info);
-                auto function = support::cpp14::make_unique<NESimpleAssemblyFunction>();
-                function->configure(std::move(kernel));
-                return std::move(function);
-            }
-            return nullptr;
-        }
-#endif // defined(__aarch64__)
-        default:
-            return nullptr;
+        return gemm_act;
     }
+
+    switch(act.activation())
+    {
+        case ActivationLayerInfo::ActivationFunction::RELU:
+            gemm_act.type = arm_gemm::Activation::Type::ReLU;
+            break;
+        case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+            gemm_act.type   = arm_gemm::Activation::Type::BoundedReLU;
+            gemm_act.param1 = act.a();
+            gemm_act.param2 = 0.f;
+            break;
+        case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+            gemm_act.type   = arm_gemm::Activation::Type::BoundedReLU;
+            gemm_act.param1 = act.a();
+            gemm_act.param2 = act.b();
+            break;
+        default:
+            gemm_act.type = arm_gemm::Activation::Type::None;
+    }
+
+    return gemm_act;
 }
 
+template <typename TypeInput, typename TypeOutput>
+class FallbackTransform : public ITransformWeights
+{
+public:
+    void run() override
+    {
+        _output.allocator()->allocate();
+        ARM_COMPUTE_ERROR_ON(_output.buffer() == nullptr);
+        _gemm_kernel_asm->pretranspose_B_array(_output.buffer(), _in1_ptr, _ldb, _multi_stride_b);
+        _reshape_run = true;
+    }
+
+    void release() override
+    {
+        _output.allocator()->free();
+    }
+
+    ITensor *get_weights() override
+    {
+        return &_output;
+    }
+
+    uint32_t uid() override
+    {
+        uint32_t id = (_B_pretranspose_size | 0x80000000);
+        return id;
+    }
+
+    void configure(size_t B_pretranspose_size, unsigned int alignment)
+    {
+        _output.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
+        _B_pretranspose_size = B_pretranspose_size;
+    }
+
+    void set_pretranspose(ITensor *tensor)
+    {
+        if(!_reshape_run)
+        {
+            _gemm_kernel_asm->set_pretransposed_B_data(tensor->buffer());
+        }
+    }
+
+    void set_args(const int ldb, const TypeInput *in1_ptr, const int multi_stride_b, std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> gemm_kernel_asm)
+    {
+        _ldb             = ldb;
+        _in1_ptr         = in1_ptr;
+        _multi_stride_b  = multi_stride_b;
+        _gemm_kernel_asm = gemm_kernel_asm;
+    }
+
+private:
+    Tensor           _output{};
+    int              _ldb{};
+    const TypeInput *_in1_ptr{};
+    int              _multi_stride_b{};
+    size_t           _B_pretranspose_size{};
+    std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
+};
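
A note on uid() above: it tags managed assembly weights by setting the top bit of the pretranspose buffer size, presumably to keep these uids apart from other transforms. A minimal standalone illustration of that scheme (the size value is hypothetical):

#include <cassert>
#include <cstdint>

int main()
{
    const uint32_t B_pretranspose_size = 4096;                  // hypothetical buffer size
    const uint32_t uid                 = B_pretranspose_size | 0x80000000u;
    assert(uid >= 0x80000000u);                                 // top bit set: distinct uid namespace
    assert((uid & 0x7FFFFFFFu) == B_pretranspose_size);         // the size remains recoverable
    return 0;
}
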
+
 /** Fallback in case ACL doesn't have a function */
 template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing>
 class Fallback : public NEGEMMAssemblyDispatch::IFallback
 {
 public:
+    /** Destructor */
+    ~Fallback()
+    {
+        // Release the memory only if we allocated it ourselves
+        if(_pretranspose && !(_weights_manager && _weights_manager->are_weights_managed(_b)))
+        {
+            delete _pretranspose;
+        }
+    }
+
     /** Initialise the function's input and output.
      *
-     * @param[in]  a            Input tensor containing the Matrix A.
-     * @param[in]  b            Input tensor containing the Matrix B.
-     * @param[in]  c            Input tensor containing the Matrix C.
-     * @param[out] d            Output tensor to store the result of matrix multiplication.
-     * @param[in]  args         Matrix multiplication information.
-     * @param[in]  gemm_info    GEMM meta-data
-     * @param[in]  memory_group Memory group to be used by the function.
-     * @param[in]  os           Output stage meta-data.
+     * @param[in]  a               Input tensor containing the Matrix A.
+     * @param[in]  b               Input tensor containing the Matrix B.
+     * @param[in]  c               Input tensor containing the Matrix C.
+     * @param[out] d               Output tensor to store the result of matrix multiplication.
+     * @param[in]  args            Matrix multiplication information.
+     * @param[in]  gemm_info       GEMM meta-data
+     * @param[in]  memory_group    Memory group to be used by the function.
+     * @param[in]  weights_manager Weights manager to be used by the function.
+     * @param[in]  os              Output stage meta-data.
      */
     void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
-                   arm_gemm::GemmArgs<TypeOutput> args, const GEMMInfo &gemm_info,
-                   MemoryGroup &memory_group, const OutputStage &os = {});
+                   arm_gemm::GemmArgs args, const GEMMInfo &gemm_info,
+                   MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {});
 
     // Inherited methods overridden:
     void run() override;
@@ -108,7 +170,7 @@
     void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment);
 
     /** Assembly Gemm kernel */
-    std::unique_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
+    std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
     /** Optimised NEON kernel */
     std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
     /** Input A */
@@ -130,23 +192,30 @@
     /** GEMM workspace */
     Tensor _workspace{};
     /** Pre-transpose tensor */
-    Tensor _pretranspose{};
+    ITensor *_pretranspose{ nullptr };
     /** Prepared flag */
     bool _is_prepared{ false };
     /** GEMM meta-data */
     GEMMInfo _gemm_info{};
+    /** Weights manager */
+    IWeightsManager *_weights_manager{ nullptr };
+    /** Weights transform object */
+    FallbackTransform<TypeInput, TypeOutput> _weights_transform{};
+    /** GEMM kernel description */
+    arm_gemm::KernelDescription _kernel_info{};
 };
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
 void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
-                                                             arm_gemm::GemmArgs<TypeOutput> args, const GEMMInfo &gemm_info,
-                                                             MemoryGroup &memory_group, const OutputStage &os)
+                                                             arm_gemm::GemmArgs args, const GEMMInfo &gemm_info,
+                                                             MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os)
 {
-    arm_gemm::GemmConfig              gemm_cfg;
-    const arm_gemm::KernelDescription gemm_kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os);
-    if(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
+    arm_gemm::GemmConfig gemm_cfg;
+    _kernel_info     = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os);
+    _weights_manager = weights_manager;
+    if(_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
     {
-        gemm_cfg.filter = gemm_kernel_info.name;
+        gemm_cfg.filter = _kernel_info.name;
         args._cfg       = &gemm_cfg;
     }
     _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
@@ -190,7 +259,16 @@
         // Forcing 128-byte alignment (required by 32-bit kernels)
         const unsigned int alignment           = 128;
         const size_t       B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
-        _pretranspose.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
+        if(weights_manager && _weights_manager->are_weights_managed(b))
+        {
+            _weights_transform.configure(B_pretranspose_size, alignment);
+            _pretranspose = _weights_manager->acquire(b, &_weights_transform);
+        }
+        else
+        {
+            _pretranspose = new Tensor();
+            static_cast<Tensor *>(_pretranspose)->allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
+        }
     }
 }
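
The extra `alignment` bytes added to the S8 backing tensor above appear intended to guarantee that a 128-byte aligned window of B_pretranspose_size bytes fits in the allocation regardless of where the raw buffer starts (cf. the COMPMID-1088 FIXME). A small standalone check of that arithmetic, with a hypothetical size:

#include <cassert>
#include <cstddef>

int main()
{
    const std::size_t alignment           = 128;   // required by the 32-bit kernels
    const std::size_t B_pretranspose_size = 1000;  // hypothetical pretranspose size
    const std::size_t allocated           = B_pretranspose_size + alignment;
    // Worst case: the raw allocation starts one byte past a 128-byte boundary
    const std::size_t worst_case_padding  = alignment - 1;
    assert(worst_case_padding + B_pretranspose_size <= allocated);
    return 0;
}
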
 
@@ -202,20 +280,34 @@
         // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
         if(_c && _c->info()->data_type() == DataType::S32)
         {
-            _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(_c->buffer() + _c->info()->offset_first_element_in_bytes()));
+            _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(_c->buffer() + _c->info()->offset_first_element_in_bytes()), 0);
         }
 
         // Pretranspose B if required
         if(_gemm_kernel_asm->B_pretranspose_required())
         {
-            _pretranspose.allocator()->allocate();
-            ARM_COMPUTE_ERROR_ON(_pretranspose.buffer() == nullptr);
             const int  ldb            = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
             const auto in1_ptr        = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
             const int  multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
 
-            _gemm_kernel_asm->pretranspose_B_array(_pretranspose.buffer(), in1_ptr, ldb, multi_stride_b);
-            _b->mark_as_unused();
+            if(_weights_manager && _weights_manager->are_weights_managed(_b))
+            {
+                _weights_transform.set_args(ldb, in1_ptr, multi_stride_b, _gemm_kernel_asm);
+                _weights_manager->run(_b, &_weights_transform);
+
+                // If we didn't run the reshape function, set the pretransposed buffer
+                if(!_weights_transform.is_reshape_run())
+                {
+                    _weights_transform.set_pretranspose(_pretranspose);
+                }
+            }
+            else
+            {
+                static_cast<Tensor *>(_pretranspose)->allocator()->allocate();
+                ARM_COMPUTE_ERROR_ON(_pretranspose->buffer() == nullptr);
+                _gemm_kernel_asm->pretranspose_B_array(_pretranspose->buffer(), in1_ptr, ldb, multi_stride_b);
+                _b->mark_as_unused();
+            }
         }
 
         _is_prepared = true;
@@ -284,105 +376,122 @@
     // Prepare assembly kernel
     prepare();
 
+    TypeOutput *bias = nullptr;
+    // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
+    if(_c && _c->info()->data_type() != DataType::S32)
+    {
+        bias = reinterpret_cast<TypeOutput *>(_c->buffer() + _c->info()->offset_first_element_in_bytes());
+    }
     // Set gemm parameters
-    _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d);
+    _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
+                                 in1_ptr, ldb, multi_stride_b,
+                                 out_ptr, ldd, batch_stride_d, multi_stride_d,
+                                 bias, 0);
 
     // Schedule assembly kernel
-    NEScheduler::get().schedule(_optimised_kernel.get(), Window::DimX);
-}
-
-template <typename TypeInput, typename TypeOutput>
-void create_function_or_arm_gemm(std::unique_ptr<IFunction> &acl_function, std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
-                                 const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info,
-                                 std::shared_ptr<IMemoryManager> memory_manager)
-{
-    INEGEMMWrapperKernel::Params p           = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info);
-    const CPUInfo               &ci          = NEScheduler::get().cpu_info();
-    unsigned int                 num_threads = NEScheduler::get().num_threads();
-
-    arm_gemm::GemmArgs<TypeOutput> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, gemm_info.pretranpose_B());
-
-    // Try to create an ACL function:
-    const arm_gemm::KernelDescription gemm_kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args);
-    acl_function                                       = create_function_all_types(gemm_kernel_info, a, b, d, alpha, beta, gemm_info, std::move(memory_manager));
-
-    // If we still don't have an ACL function:
-    if(acl_function == nullptr)
+    IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
+    if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && _d->info()->data_type() == DataType::F32)
     {
-        //Fallback onto arm_gemm function if ACL doesn't support this method.
-        auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput>>();
-        fallback->configure(a, b, c, d, args, gemm_info, memory_group);
-        arm_gemm = std::move(fallback);
+        const int granule_threshold = 200;
+        scheduling_hint             = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
     }
+    NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
 }
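
A minimal sketch of the scheduling hint constructed above, isolated into a helper; the 200-granule threshold is the heuristic used in this patch, not a tunable API contract:

#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/IScheduler.h"

using namespace arm_compute;

// Returns the hint used for F32 GEMM_INTERLEAVED above: split along DimX, preferring dynamic
// work distribution (the threshold gates when dynamic splitting is considered worthwhile).
IScheduler::Hints make_interleaved_gemm_hint()
{
    const int granule_threshold = 200;
    return IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
}
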
 
 template <typename TypeInput, typename TypeOutput>
-void create_function_or_arm_gemm_quant(std::unique_ptr<IFunction> &acl_function, std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
-                                       const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info,
-                                       std::shared_ptr<IMemoryManager> memory_manager)
+void create_arm_gemm(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
+                     const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info,
+                     IWeightsManager *weights_manager)
 {
     INEGEMMWrapperKernel::Params p           = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info);
     const CPUInfo               &ci          = NEScheduler::get().cpu_info();
     unsigned int                 num_threads = NEScheduler::get().num_threads();
 
-    arm_gemm::GemmArgs<TypeOutput> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, gemm_info.pretranpose_B());
+    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, gemm_info.pretranpose_B());
+
+    // Create arm_gemm fallback
+    auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput>>();
+    fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager);
+    arm_gemm = std::move(fallback);
+}
+
+template <typename TypeInput, typename TypeOutput>
+void create_arm_gemm_quant(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
+                           const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info,
+                           IWeightsManager *weights_manager)
+{
+    INEGEMMWrapperKernel::Params p           = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info);
+    const CPUInfo               &ci          = NEScheduler::get().cpu_info();
+    unsigned int                 num_threads = NEScheduler::get().num_threads();
+
+    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, gemm_info.pretranpose_B());
 
     // Configure requantization info
     const int32_t                 a_offset = -a->info()->quantization_info().uniform().offset;
     const int32_t                 b_offset = -b->info()->quantization_info().uniform().offset;
     const GEMMLowpOutputStageInfo os_info  = gemm_info.gemmlowp_output_stage();
 
-    const arm_gemm::ARequantizeLayer32 gemm_requant_info(nullptr,
+    const arm_gemm::ARequantizeLayer32 gemm_requant_info(nullptr, 0,
                                                          a_offset, b_offset, os_info.gemmlowp_offset,
                                                          -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
                                                          os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
 
-    // Try to create an ACL function:
-    const arm_gemm::KernelDescription gemm_kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args, gemm_requant_info);
-    acl_function                                       = create_function_all_types(gemm_kernel_info, a, b, d, alpha, beta, gemm_info, std::move(memory_manager));
-
-    // If we still don't have an ACL function:
-    if(acl_function == nullptr)
-    {
-        // Fallback onto arm_gemm function if ACL doesn't support this method.
-        auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::ARequantizeLayer32>>();
-        fallback->configure(a, b, c, d, args, gemm_info, memory_group, gemm_requant_info);
-        arm_gemm = std::move(fallback);
-    }
+    // Create arm_gemm fallback
+    auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::ARequantizeLayer32>>();
+    fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager, gemm_requant_info);
+    arm_gemm = std::move(fallback);
 }
 
 } //namespace
 
-NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager)
-    : _function(nullptr), _arm_gemm(nullptr), _memory_group(memory_manager), _memory_manager(memory_manager)
+NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+    : _arm_gemm(nullptr), _memory_group(std::move(memory_manager)), _weights_manager(weights_manager)
 {
 }
 
-Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
+Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info)
 {
-    ARM_COMPUTE_UNUSED(alpha, beta, gemm_info);
+    ARM_COMPUTE_UNUSED(gemm_info);
     ARM_COMPUTE_UNUSED(c);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
 #ifndef __aarch64__
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 || a->data_type() == DataType::S8 || a->data_type() == DataType::QASYMM8, "8bit integer types only supported for aarch64");
 #endif /* __aarch64__ */
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::U8, DataType::QASYMM8, DataType::S8, DataType::F16);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+                                                         DataType::F16, DataType::F32);
+    if(is_data_type_quantized_per_channel(b->data_type()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    }
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::QASYMM8, "Only QASYMM8 output supported for QASYMM8 input");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8_SIGNED && d->data_type() != DataType::S32, "Only S32 output supported for QASYMM8_SIGNED input");
     return Status{};
 }
 
-void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
+bool NEGEMMAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
+{
+    arm_gemm::Activation act = map_to_arm_gemm_activation(activation);
+    return act.type != arm_gemm::Activation::Type::None;
+}
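
A usage sketch of the new entry point (assuming the static declaration in the public header): per the mapping above, bounded ReLU variants with a zero lower bound are fusible, while anything else falls back to Activation::Type::None.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"

using namespace arm_compute;

bool check_fusible_activations()
{
    const ActivationLayerInfo relu6(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f);
    const ActivationLayerInfo lrelu(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f);

    const bool relu6_fused = NEGEMMAssemblyDispatch::is_activation_supported(relu6); // expected: true
    const bool lrelu_fused = NEGEMMAssemblyDispatch::is_activation_supported(lrelu); // expected: false
    return relu6_fused && !lrelu_fused;
}
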
+
+void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+    arm_gemm::Activation act = map_to_arm_gemm_activation(gemm_info.activation_info());
 
     // If we don't support a combination of data types, silently return: it is the caller's responsibility to check whether configure() was successful via is_configured()
-    if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), alpha, beta, gemm_info))
+    if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), gemm_info))
     {
         return;
     }
@@ -390,27 +499,28 @@
     switch(a->info()->data_type())
     {
         case DataType::F32:
-            create_function_or_arm_gemm<float, float>(_function, _arm_gemm, _memory_group, a, b, c, d, alpha, beta, gemm_info, _memory_manager);
+            create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
             break;
 #ifdef __aarch64__
         case DataType::U8:
         case DataType::QASYMM8:
             if(d->info()->data_type() == DataType::S32)
             {
-                create_function_or_arm_gemm<uint8_t, uint32_t>(_function, _arm_gemm, _memory_group, a, b, c, d, alpha, beta, gemm_info, _memory_manager);
+                create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
             }
             else
             {
-                create_function_or_arm_gemm_quant<uint8_t, uint8_t>(_function, _arm_gemm, _memory_group, a, b, c, d, alpha, beta, gemm_info, _memory_manager);
+                create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
             }
             break;
         case DataType::S8:
-            create_function_or_arm_gemm<int8_t, int32_t>(_function, _arm_gemm, _memory_group, a, b, c, d, alpha, beta, gemm_info, _memory_manager);
+        case DataType::QASYMM8_SIGNED:
+            create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
             break;
 #endif /* __aarch64__ */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            create_function_or_arm_gemm<float16_t, float16_t>(_function, _arm_gemm, _memory_group, a, b, c, d, alpha, beta, gemm_info, _memory_manager);
+            create_arm_gemm<float16_t, float16_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
             break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
@@ -420,33 +530,20 @@
 
 void NEGEMMAssemblyDispatch::prepare()
 {
-    if(_function != nullptr)
-    {
-        _function->prepare();
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
-        _arm_gemm->prepare();
-    }
+    ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+    _arm_gemm->prepare();
 }
 
 bool NEGEMMAssemblyDispatch::is_configured() const
 {
-    return (_arm_gemm != nullptr && _arm_gemm->is_configured()) || _function != nullptr;
+    return _arm_gemm != nullptr && _arm_gemm->is_configured();
 }
 
 void NEGEMMAssemblyDispatch::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
-    if(_function != nullptr)
-    {
-        _function->run();
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
-        _arm_gemm->run();
-    }
+
+    ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+    _arm_gemm->run();
 }
 } //namespace arm_compute
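
A caller-side sketch of the contract spelled out in configure() above: on an unsupported data-type combination configure() returns silently, so is_configured() must gate the assembly path. Constructor defaults for the memory and weights managers are assumed from the header.

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"

using namespace arm_compute;

bool run_assembly_gemm(const ITensor *a, const ITensor *b, ITensor *d)
{
    NEGEMMAssemblyDispatch asm_glue; // assumed defaulted manager arguments
    asm_glue.configure(a, b, nullptr, d, GEMMInfo(false, false, true /* reshape B only on first run */));
    if(!asm_glue.is_configured())
    {
        return false; // caller falls back to a non-assembly path
    }
    asm_glue.prepare();
    asm_glue.run();
    return true;
}
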
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index e94c893..a730749 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -29,9 +29,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "support/ToolchainSupport.h"
 
-#include <cmath>
 #include <set>
 #include <tuple>
 
@@ -50,7 +48,6 @@
     ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(),
                                                                           (biases != nullptr) ? biases->info() : nullptr,
                                                                           output->info()));
-
     const bool     append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
     const ITensor *biases_to_use = (append_biases) ? biases : nullptr;
 
@@ -62,7 +59,7 @@
 Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
 
     if(biases != nullptr)
@@ -89,105 +86,99 @@
     NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
 }
 
-NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
-    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
-      _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false),
-      _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
+NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager, IWeightsManager *weights_manager)
+    : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager),
+      _col2im_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _skip_im2col(false),
+      _skip_col2im(false), _is_quantized(false), _is_prepared(false)
 {
 }
 
 void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info, int gemm_3d_depth)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? nullptr : output->info(), act_info, gemm_3d_depth,
-                                           _skip_im2col));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? nullptr : output->info(),
+                                           act_info, gemm_3d_depth, _skip_im2col));
 
+    // Create GEMMInfo structure
     const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
-                                         gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
+                                         gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
+                                         false, GEMMLowpOutputStageInfo(), false, false, act_info);
+
+    // Supported activations in GEMM
+    const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+                                                                               ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+                                                                               ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+                                                                             };
 
     if(_is_quantized)
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
         // Extract and negate input and weights offset
-        const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo wqinfo = weights->info()->quantization_info().uniform();
+        const QuantizationInfo        iqinfo  = input->info()->quantization_info();
+        const QuantizationInfo        wqinfo  = weights->info()->quantization_info();
+        const QuantizationInfo        oqinfo  = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info();
+        const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
+        const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
 
-        input->info()->set_quantization_info(QuantizationInfo(iqinfo.scale, -iqinfo.offset));
-        weights->info()->set_quantization_info(QuantizationInfo(wqinfo.scale, -wqinfo.offset));
-
-        const UniformQuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info().uniform();
-
-        float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
-        int   output_multiplier;
-        int   output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
+        if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
+        {
+            const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
+            weights->info()->set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
+        }
 
         // Merge activation with output stage
         int min_activation = 0;
         int max_activation = 255;
 
-        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-                                                                                 };
-        if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+        if(supported_acts.count(act_info.activation()) != 0)
         {
-            const int a_const_int = quantize_qasymm8(act_info.a(), oqinfo);
-            const int b_const_int = quantize_qasymm8(act_info.b(), oqinfo);
+            const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo);
+            const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo);
 
-            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oqinfo.offset : b_const_int;
+            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int;
             max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
-
-            _is_activationlayer_enabled = false;
         }
 
         GEMMLowpOutputStageInfo output_info;
-        output_info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-        output_info.gemmlowp_offset     = oqinfo.offset;
-        output_info.gemmlowp_multiplier = output_multiplier;
-        output_info.gemmlowp_shift      = output_shift;
-        output_info.gemmlowp_min_bound  = min_activation;
-        output_info.gemmlowp_max_bound  = max_activation;
+        output_info.type               = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+        output_info.gemmlowp_offset    = uoqinfo.offset;
+        output_info.gemmlowp_min_bound = min_activation;
+        output_info.gemmlowp_max_bound = max_activation;
+        quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info);
 
         _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info));
 
         // Restore the original QuantizationInfo, as the input and weights could be used in other convolution layers
-        input->info()->set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset));
-        weights->info()->set_quantization_info(QuantizationInfo(wqinfo.scale, wqinfo.offset));
+        input->info()->set_quantization_info(iqinfo);
+        weights->info()->set_quantization_info(wqinfo);
     }
     else
     {
         // Configure matrix multiply function
-        _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+        _mm_gemm.configure(input, weights, biases, output, 1.0f, 0.0f, gemm_info);
     }
 }
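
A worked example of the bound merging in the quantized branch above, assuming a hypothetical output quantization of scale 0.05 and offset 10 with a fused BOUNDED_RELU(a = 6): quantize_qasymm8 maps 6.0 to round(6.0 / 0.05) + 10 = 130, so the GEMMLowp clamp becomes [10, 130]. A standalone check of that arithmetic:

#include <cassert>
#include <cmath>

int main()
{
    const float scale            = 0.05f; // hypothetical output scale
    const int   offset           = 10;    // hypothetical output zero point
    const float relu_upper_bound = 6.f;

    const int a_const_int    = static_cast<int>(std::lround(relu_upper_bound / scale)) + offset; // 130
    const int min_activation = offset;       // lower clamp stays at the zero point for BOUNDED_RELU
    const int max_activation = a_const_int;  // upper clamp comes from act_info.a()

    assert(min_activation == 10 && max_activation == 130);
    return 0;
}
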
 
-Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act_info,
-                                           int gemm_3d_depth, bool skip_im2col)
+Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+                                           const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
 {
     const bool is_quantized          = is_data_type_quantized_asymmetric(input->data_type());
     const bool is_activation_enabled = act_info.enabled();
 
-    const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
-                                         gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
+    // Create GEMMInfo structure
+    const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+                                        gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
+                                        false, GEMMLowpOutputStageInfo(), false, false, act_info);
+
     if(is_quantized)
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
         // Extract and negate input and weights offset
-        const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
-        const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
-
-        std::unique_ptr<ITensorInfo> input_qa   = input->clone();
-        std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
-        input_qa->set_quantization_info(QuantizationInfo(iqinfo.scale, -iqinfo.offset));
-        weights_qa->set_quantization_info(QuantizationInfo(wqinfo.scale, -wqinfo.offset));
-
-        const UniformQuantizationInfo oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info().uniform();
-
-        float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
-        int   output_multiplier;
-        int   output_shift;
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+        const QuantizationInfo       &iqinfo  = input->quantization_info();
+        const QuantizationInfo       &wqinfo  = weights->quantization_info();
+        const QuantizationInfo       &oqinfo  = (output->total_size() == 0) ? iqinfo : output->quantization_info();
+        const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
 
         // Merge activation with output stage
         int min_activation = 0;
@@ -199,22 +190,25 @@
                                                                                  };
         if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
         {
-            const int a_const_int = quantize_qasymm8(act_info.a(), oqinfo);
-            const int b_const_int = quantize_qasymm8(act_info.b(), oqinfo);
+            const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo);
+            const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo);
 
-            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oqinfo.offset : b_const_int;
+            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int;
             max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
         }
 
         GEMMLowpOutputStageInfo output_info;
-        output_info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-        output_info.gemmlowp_offset     = oqinfo.offset;
-        output_info.gemmlowp_multiplier = output_multiplier;
-        output_info.gemmlowp_shift      = output_shift;
-        output_info.gemmlowp_min_bound  = min_activation;
-        output_info.gemmlowp_max_bound  = max_activation;
+        output_info.type               = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+        output_info.gemmlowp_offset    = uoqinfo.offset;
+        output_info.gemmlowp_min_bound = min_activation;
+        output_info.gemmlowp_max_bound = max_activation;
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info));
 
         // Perform validation step on GEMMLowp
+        std::unique_ptr<ITensorInfo> input_qa   = input->clone();
+        std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+        input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
+        weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
         return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info));
     }
     else
@@ -224,7 +218,7 @@
     }
 }
 
-Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
+Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
 {
     const DataType     data_type = input_info->data_type();
     const unsigned int mult_y    = skip_im2col ? 1U : gemm_3d_depth;
@@ -232,7 +226,7 @@
 
     // Set dummy tensor shapes for the validation
     const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
-    const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type);
+    const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info());
     const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
 
     return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, gemm_3d_depth, skip_im2col);
@@ -242,7 +236,7 @@
                                        const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_UNUSED(num_groups);
+    ARM_COMPUTE_UNUSED(num_groups, weights_info);
     ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConvolutionLayer::validate(input->info(),
                                                                 weights->info(),
                                                                 biases != nullptr ? biases->info() : nullptr,
@@ -262,13 +256,11 @@
     const unsigned int kernel_width  = weights->info()->dimension(idx_width);
     const unsigned int kernel_height = weights->info()->dimension(idx_height);
 
-    _is_prepared                = weights_info.retain_internal_weights();
-    _original_weights           = weights;
-    _is_quantized               = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _data_layout                = data_layout;
-    _skip_im2col                = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-    _append_bias                = (biases != nullptr) && (!_is_quantized);
-    _is_activationlayer_enabled = act_info.enabled();
+    _is_prepared      = weights_info.retain_internal_weights();
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _data_layout      = data_layout;
+    _skip_im2col      = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
 
     const ITensor *gemm_input_to_use  = input;
     ITensor       *gemm_output_to_use = output;
@@ -286,7 +278,7 @@
     // Check if GEMM3D is supported
     if(data_layout == DataLayout::NHWC)
     {
-        _skip_col2im = bool(validate_gemm3d(input->info(), act_info, conv_h, true));
+        _skip_col2im = bool(validate_gemm3d(input->info(), weights->info(), act_info, conv_h, true));
         // If not supported, we need to perform im2col and col2im (or reshape layer)
         if(!_skip_col2im)
         {
@@ -298,8 +290,6 @@
         _skip_col2im = false;
     }
 
-    const ITensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr;
-
     // Get parameters from conv_info
     unsigned int stride_x = 0;
     unsigned int stride_y = 0;
@@ -309,7 +299,18 @@
 
     // _weights_reshaped will be auto-configured in the kernel.
     // Do not transpose 1xW as it will be reshaped in NEGEMM (biases are no longer appended here)
-    _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped);
+    const ITensor *weights_to_use = weights;
+
+    if(_weights_manager && _weights_manager->are_weights_managed(weights))
+    {
+        _reshape_weights_managed.configure(weights, nullptr);
+        weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed);
+    }
+    else
+    {
+        _reshape_weights.configure(weights, nullptr, &_weights_reshaped);
+        weights_to_use = &_weights_reshaped;
+    }
 
     // Create tensor to store im2col reshaped inputs
     if(!_skip_im2col)
@@ -317,16 +318,11 @@
         _memory_group.manage(&_im2col_output);
 
         // Configure
-        _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation);
+        _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation);
 
         // Update GEMM input
         gemm_input_to_use = &_im2col_output;
     }
-    else if(_append_bias)
-    {
-        // Configure add bias kernel
-        _add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
-    }
 
     // Create temporary GEMM output tensor in case we cannot skip col2im
     if(!_skip_col2im)
@@ -351,7 +347,7 @@
     // Configure GEMM
     // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
-    configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, gemm_3d_depth);
+    configure_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, gemm_3d_depth);
 
     if(!_skip_im2col)
     {
@@ -384,14 +380,6 @@
 
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
                              "Output shape does not match the expected one");
-
-    // Configure Activation Layer
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
-    }
-
-    ARM_COMPUTE_UNUSED(weights_info);
 }
 
 Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -400,7 +388,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON");
 
@@ -422,10 +410,9 @@
     const ITensorInfo *gemm_output_to_use = output;
     const ITensorInfo *weights_to_use     = weights;
 
-    const bool is_quantized          = is_data_type_quantized_asymmetric(data_type);
-    const bool append_bias           = (biases != nullptr) && (!is_quantized);
-    bool       skip_im2col           = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-    bool       is_activation_enabled = act_info.enabled();
+    const bool append_bias  = false;
+    const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+    bool       skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
 
     // Get convolved dimensions
     unsigned int conv_w = 0;
@@ -442,7 +429,7 @@
     bool skip_col2im = false;
     if(data_layout == DataLayout::NHWC)
     {
-        skip_col2im = bool(validate_gemm3d(input, act_info, conv_h, true));
+        skip_col2im = bool(validate_gemm3d(input, weights, act_info, conv_h, true));
         // If not supported, we need to perform im2col and col2im (or reshape layer)
         if(!skip_col2im)
         {
@@ -453,16 +440,13 @@
     if(skip_col2im)
     {
         // If not supported, we need to perform im2col and col2im (or reshape layer)
-        if(!bool(validate_gemm3d(input, act_info, conv_h, skip_im2col)))
+        if(!bool(validate_gemm3d(input, weights, act_info, conv_h, skip_im2col)))
         {
             skip_im2col = false;
             skip_col2im = false;
         }
     }
 
-    const unsigned     bias_element  = (append_bias && !skip_im2col) ? 1 : 0;
-    const ITensorInfo *biases_to_use = (append_bias && !skip_im2col) ? biases : nullptr;
-
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
 
@@ -481,19 +465,14 @@
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     }
 
-    if(act_info.enabled())
-    {
-        ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a());
-    }
-
     unsigned int mat_weights_cols = weights->dimension(idx_kernels);
-    unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel) + bias_element;
+    unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
 
     // Output tensor auto-initialization if not yet initialized
-    ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases_to_use, nullptr));
-    weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, (append_bias && !skip_im2col)), 1, data_type);
+    ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr));
+    weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, data_type);
     weights_reshaped_info.set_quantization_info(weights->quantization_info());
-    weights_to_use        = &weights_reshaped_info;
+    weights_to_use = &weights_reshaped_info;
 
     if(!skip_im2col)
     {
@@ -511,11 +490,6 @@
         ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
         gemm_input_to_use = &im2col_reshaped_info;
     }
-    else if(append_bias)
-    {
-        // Validate add bias kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
-    }
 
     // Create temporary GEMM output tensor in case we cannot skip col2im
     if(!skip_col2im)
@@ -539,12 +513,6 @@
         ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h)));
     }
 
-    //Validate Activation Layer
-    if(is_activation_enabled)
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
-    }
-
     return Status{};
 }
 
@@ -573,11 +541,6 @@
         _mm_gemm.run();
     }
 
-    if(_skip_im2col && _append_bias)
-    {
-        NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY);
-    }
-
     // Reshape output matrix
     if(!_skip_col2im)
     {
@@ -590,23 +553,23 @@
             _reshape_layer.run();
         }
     }
-
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
 }
 
 void NEGEMMConvolutionLayer::prepare()
 {
     if(!_is_prepared)
     {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        // Run weights reshaping and mark original weights tensor as unused
-        _weights_reshaped.allocator()->allocate();
-        _reshape_weights.run();
-        _original_weights->mark_as_unused();
+        if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+        {
+            _weights_manager->run(_original_weights, &_reshape_weights_managed);
+        }
+        else
+        {
+            // Run weights reshaping and mark original weights tensor as unused
+            _weights_reshaped.allocator()->allocate();
+            _reshape_weights.run();
+            _original_weights->mark_as_unused();
+        }
 
         // Prepare GEMM
         _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
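
A sketch of how a caller could opt into the weights-manager path this file now supports. It assumes IWeightsManager is usable as shown (a manage() entry point registering the tensor before configure()) and that the manager outlives the layer; treat the flow as illustrative rather than the library's documented API.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"

using namespace arm_compute;

void configure_managed_conv(IWeightsManager &weights_mgr,
                            const ITensor *input, const ITensor *weights, const ITensor *biases,
                            ITensor *output, const PadStrideInfo &conv_info)
{
    weights_mgr.manage(weights);                         // assumed registration entry point
    NEGEMMConvolutionLayer conv(nullptr, &weights_mgr);  // weights manager forwarded to the GEMM backends
    conv.configure(input, weights, biases, output, conv_info);
    // The reshaped weights are then acquired from (and cached by) weights_mgr in prepare()/run().
}
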
diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
index 63f330b..a478fdd 100644
--- a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
+++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,11 +26,12 @@
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index aa40113..346d025 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -59,7 +59,7 @@
         case DataType::QASYMM8:
         case DataType::U8:
         {
-            _asm_glue.configure(a, b, c, output, 1.f, 0.f, GEMMInfo(false, false, true));
+            _asm_glue.configure(a, b, c, output, GEMMInfo(false, false, true));
             run_optimised = _asm_glue.is_configured();
             break;
         }
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index a03ec10..5b9d055 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -42,8 +42,9 @@
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
-      _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _original_b(nullptr), _a_offset(0), _b_offset(0),
-      _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false)
+      _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
+      _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
+      _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)
 {
 }
 
@@ -55,6 +56,7 @@
 
     const ITensor *matrix_a = a;
     const ITensor *matrix_b = b;
+    GEMMInfo       info     = gemm_info;
 
     // Clear state
     _mtx_a_reshape_kernel = nullptr;
@@ -64,13 +66,44 @@
     _a_offset                         = a->info()->quantization_info().uniform().offset;
     _b_offset                         = b->info()->quantization_info().uniform().offset;
     _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
-    _reshape_b_only_on_first_run      = gemm_info.reshape_b_only_on_first_run();
+    _reshape_b_only_on_first_run      = info.reshape_b_only_on_first_run();
     _is_prepared                      = false;
     _fused_assembly_path              = false;
+    _flip_signedness                  = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
     _original_b                       = b;
 
+    const ITensor *a_to_use = a;
+
+    // Convert the input from QASYMM8 to QASYMM8_SIGNED (and convert the output back afterwards)
+    if(_flip_signedness)
+    {
+        const int32_t                 offset_correction = 128;
+        const DataType                dt                = DataType::QASYMM8_SIGNED;
+        const UniformQuantizationInfo iqinfo            = a_to_use->info()->quantization_info().uniform();
+
+        _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
+        _memory_group.manage(&_signed_a);
+        _convert_to_signed_asymm.configure(a_to_use, &_signed_a);
+        a_to_use  = &_signed_a;
+        _a_offset = _signed_a.info()->quantization_info().uniform().offset;
+
+        const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
+        _memory_group.manage(&_signed_output);
+        _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)));
+
+        // Output stage correction
+        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+        output_stage_corr.gemmlowp_offset         = _signed_output.info()->quantization_info().uniform().offset;
+        output_stage_corr.gemmlowp_min_bound -= offset_correction;
+        output_stage_corr.gemmlowp_max_bound -= offset_correction;
+        info.set_gemmlowp_output_stage(output_stage_corr);
+
+        // Update matrix a
+        matrix_a = &_signed_a;
+    }
+
     // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
-    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+    if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
     {
         _fuse_output_stage = true;
         _memory_group.manage(&_mm_result_s32);
@@ -82,17 +115,18 @@
     switch(a->info()->data_type())
     {
         case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
         case DataType::U8:
         case DataType::S8:
         {
-            if(a->info()->data_type() == DataType::QASYMM8 && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+            if(a_to_use->info()->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
             {
-                _asm_glue.configure(a, b, c, output, 1.f, 0.f, gemm_info);
+                _asm_glue.configure(a_to_use, b, c, output, gemm_info);
                 _fused_assembly_path = _asm_glue.is_configured();
             }
             else
             {
-                _asm_glue.configure(a, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, 1.f, 0.f, gemm_info);
+                _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);
             }
             _assembly_path = _asm_glue.is_configured();
             break;
@@ -110,7 +144,7 @@
         matrix_b = &_tmp_b;
 
         // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-        TensorInfo a_info(compute_interleaved_shape(*a->info()), 1, a->info()->data_type(), a->info()->quantization_info());
+        TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
         // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
         TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
         _tmp_a.allocator()->init(a_info);
@@ -124,7 +158,7 @@
         // Configure interleave kernel
         {
             auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
-            k->configure(a, &_tmp_a);
+            k->configure(a_to_use, &_tmp_a);
             _mtx_a_reshape_kernel = std::move(k);
         }
 
@@ -150,19 +184,19 @@
             }
 
             // Configure Matrix B reduction kernel
-            _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);
+            _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
         }
 
         // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
         if(_b_offset != 0)
         {
-            TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
+            TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
 
             _vector_sum_row.allocator()->init(info_vector_sum_row);
             _memory_group.manage(&_vector_sum_row);
 
             // Configure matrix A reduction kernel
-            _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);
+            _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), false);
         }
 
         if(_fuse_output_stage)
@@ -175,8 +209,17 @@
                 _mm_kernel = std::move(k);
             }
 
-            _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
-                                                               _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());
+            _offset_contribution_output_stage_kernel.configure(&_mm_result_s32,
+                                                               _a_offset == 0 ? nullptr : &_vector_sum_col,
+                                                               _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+                                                               _flip_signedness ? &_signed_output : output,
+                                                               a->info()->dimension(0),
+                                                               _a_offset, _b_offset, info.gemmlowp_output_stage());
+
+            if(_flip_signedness)
+            {
+                _convert_from_signed_asymm.configure(&_signed_output, output);
+            }
         }
         else
         {
@@ -188,10 +231,18 @@
                 _mm_kernel = std::move(k);
             }
             // Configure offset contribution kernel
-            _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+            _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
         }
     }
 
+    // Configure activation
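+    // The activation is run as a separate function only when no assembly path is used or the assembly path cannot fuse the requested activation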
+    const ActivationLayerInfo &activation = gemm_info.activation_info();
+    _run_activation                       = activation.enabled() && (!_assembly_path || !NEGEMMAssemblyDispatch::is_activation_supported(activation));
+    if(_run_activation)
+    {
+        _activation_func.configure(output, nullptr, activation);
+    }
+
     // Allocate tensors
     if(!_assembly_path && !_run_vector_matrix_multiplication)
     {
@@ -219,22 +270,31 @@
     {
         _mm_result_s32.allocator()->allocate();
     }
+
+    if(_flip_signedness)
+    {
+        _signed_a.allocator()->allocate();
+        _signed_output.allocator()->allocate();
+    }
 }
 
 Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                     "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
 
+    GEMMInfo           info          = gemm_info;
     const ITensorInfo *matrix_a_info = a;
     const ITensorInfo *matrix_b_info = b;
 
+    const ITensorInfo *a_to_use = a;
+
     TensorInfo tmp_a_info{};
     TensorInfo tmp_b_info{};
     TensorInfo mm_result_s32_info{};
@@ -242,31 +302,60 @@
     int32_t a_offset = a->quantization_info().uniform().offset;
     int32_t b_offset = b->quantization_info().uniform().offset;
 
-    bool fuse_output_stage = gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+    bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
     if(fuse_output_stage)
     {
         auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
     }
 
+    // Convert QASYMM8->QASYMM8_SIGNED
+    TensorInfo signed_a{};
+    TensorInfo signed_output{};
+    bool       flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
+    if(flip_signedness)
+    {
+        const int32_t                 offset_correction = 128;
+        const DataType                dt                = DataType::QASYMM8_SIGNED;
+        const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();
+
+        signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
+        a_to_use = &signed_a;
+        a_offset = signed_a.quantization_info().uniform().offset;
+
+        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
+        signed_output                        = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+
+        // Output stage correction
+        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+        output_stage_corr.gemmlowp_offset         = signed_output.quantization_info().uniform().offset;
+        output_stage_corr.gemmlowp_min_bound -= offset_correction;
+        output_stage_corr.gemmlowp_max_bound -= offset_correction;
+        info.set_gemmlowp_output_stage(output_stage_corr);
+
+        // Update matrix a
+        matrix_a_info = &signed_a;
+    }
+
     // Check if we need to run the optimized assembly kernel
     bool run_optimised             = false;
     bool run_optimised_requantized = false;
-    if(is_data_type_quantized_asymmetric(a->data_type()))
+    if(a_to_use->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
     {
-        run_optimised             = bool(NEGEMMAssemblyDispatch::validate(a, b, c, output, 1.f, 0.f, gemm_info));
+        run_optimised             = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
         run_optimised_requantized = run_optimised;
     }
     else
     {
-        run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f, gemm_info));
+        run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
     }
 
     if(run_optimised)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
-        if(gemm_info.depth_output_gemm3d() != 0)
+        if(info.depth_output_gemm3d() != 0)
         {
-            if(gemm_info.reinterpret_input_as_3d())
+            if(info.reinterpret_input_as_3d())
             {
                 ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
                 ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
@@ -283,8 +372,8 @@
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
 
         const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
         if(!run_vector_matrix_multiplication)
@@ -303,10 +392,10 @@
             shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
 
             // Validate interleave kernel
-            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(shape_tmp_a));
+            auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
             auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
 
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
             ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
         }
     }
@@ -331,7 +420,7 @@
             info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
 
             // Configure matrix A reduction kernel
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, a->dimension(0), false));
         }
 
         if(fuse_output_stage)
@@ -345,8 +434,10 @@
             ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                 a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                 b_offset == 0 ? nullptr : &info_vector_sum_row,
-                                                                                                c, output, a_offset, b_offset,
-                                                                                                gemm_info.gemmlowp_output_stage()));
+                                                                                                c,
+                                                                                                flip_signedness ? &signed_output : output,
+                                                                                                a_offset, b_offset,
+                                                                                                info.gemmlowp_output_stage()));
         }
         else
         {
@@ -361,6 +452,14 @@
                                                                                      a_offset, b_offset));
         }
     }
+
+    // Validate activation
+    const ActivationLayerInfo &activation = gemm_info.activation_info();
+    if(activation.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
+    }
+
     return Status{};
 }
 
@@ -370,6 +469,12 @@
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
+    // Convert QASYMM8->QASYMM8_SIGNED
+    if(_flip_signedness)
+    {
+        NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY);
+    }
+
     // Reshape inputs
     if(_mtx_a_reshape_kernel)
     {
@@ -415,6 +520,18 @@
             NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
         }
     }
+
+    // Convert QASYMM8_SIGNED->QASYMM8
+    if(_flip_signedness)
+    {
+        NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY);
+    }
+
+    // Run fused activation
+    if(_run_activation)
+    {
+        _activation_func.run();
+    }
 }
 
 void NEGEMMLowpMatrixMultiplyCore::prepare()
diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
new file mode 100644
index 0000000..7f25b63
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager),
+      _permute_deltas_kernel(),
+      _flatten_deltas_kernel(),
+      _permute_scores_kernel(),
+      _flatten_scores_kernel(),
+      _compute_anchors_kernel(),
+      _bounding_box_kernel(),
+      _pad_kernel(),
+      _dequantize_anchors(),
+      _dequantize_deltas(),
+      _quantize_all_proposals(),
+      _cpp_nms(memory_manager),
+      _is_nhwc(false),
+      _is_qasymm8(false),
+      _deltas_permuted(),
+      _deltas_flattened(),
+      _deltas_flattened_f32(),
+      _scores_permuted(),
+      _scores_flattened(),
+      _all_anchors(),
+      _all_anchors_f32(),
+      _all_proposals(),
+      _all_proposals_quantized(),
+      _keeps_nms_unused(),
+      _classes_nms_unused(),
+      _proposals_4_roi_values(),
+      _all_proposals_to_use(nullptr),
+      _num_valid_proposals(nullptr),
+      _scores_out(nullptr)
+{
+}
+
+void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals,
+                                         const GenerateProposalsInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
+    ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+
+    _is_nhwc                        = scores->info()->data_layout() == DataLayout::NHWC;
+    const DataType scores_data_type = scores->info()->data_type();
+    _is_qasymm8                     = scores_data_type == DataType::QASYMM8;
+    const int    num_anchors        = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+    const int    feat_width         = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+    const int    feat_height        = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
+    const int    total_num_anchors  = num_anchors * feat_width * feat_height;
+    const int    pre_nms_topN       = info.pre_nms_topN();
+    const int    post_nms_topN      = info.post_nms_topN();
+    const size_t values_per_roi     = info.values_per_roi();
+
+    const QuantizationInfo scores_qinfo   = scores->info()->quantization_info();
+    const DataType         rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type;
+    const QuantizationInfo rois_qinfo     = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
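+    // QASYMM8 inputs produce QASYMM16 ROIs with a fixed quantization of 1/8 scale and zero offset; validate() enforces the same 1/8 scale on the QSYMM16 anchors and on the proposals output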
+
+    // Compute all the anchors
+    _memory_group.manage(&_all_anchors);
+    _compute_anchors_kernel.configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+
+    const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
+    _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
+
+    // Permute and reshape deltas
+    _memory_group.manage(&_deltas_flattened);
+    if(!_is_nhwc)
+    {
+        _memory_group.manage(&_deltas_permuted);
+        _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+        _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+        _deltas_permuted.allocator()->allocate();
+    }
+    else
+    {
+        _flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
+    }
+
+    const TensorShape flatten_shape_scores(1, total_num_anchors);
+    _scores_flattened.allocator()->init(TensorInfo(flatten_shape_scores, 1, scores_data_type, scores_qinfo));
+
+    // Permute and reshape scores
+    _memory_group.manage(&_scores_flattened);
+    if(!_is_nhwc)
+    {
+        _memory_group.manage(&_scores_permuted);
+        _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+        _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+        _scores_permuted.allocator()->allocate();
+    }
+    else
+    {
+        _flatten_scores_kernel.configure(scores, &_scores_flattened);
+    }
+
+    Tensor *anchors_to_use = &_all_anchors;
+    Tensor *deltas_to_use  = &_deltas_flattened;
+    if(_is_qasymm8)
+    {
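+        // For QASYMM8 inputs the bounding box transform runs in F32: anchors and deltas are dequantized first and the resulting proposals are requantized to QASYMM16 afterwards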
+        _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32));
+        _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32));
+        _memory_group.manage(&_all_anchors_f32);
+        _memory_group.manage(&_deltas_flattened_f32);
+        // Dequantize anchors to float
+        _dequantize_anchors.configure(&_all_anchors, &_all_anchors_f32);
+        _all_anchors.allocator()->allocate();
+        anchors_to_use = &_all_anchors_f32;
+        // Dequantize deltas to float
+        _dequantize_deltas.configure(&_deltas_flattened, &_deltas_flattened_f32);
+        _deltas_flattened.allocator()->allocate();
+        deltas_to_use = &_deltas_flattened_f32;
+    }
+    // Bounding box transform
+    _memory_group.manage(&_all_proposals);
+    BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f);
+    _bounding_box_kernel.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info);
+    deltas_to_use->allocator()->allocate();
+    anchors_to_use->allocator()->allocate();
+
+    _all_proposals_to_use = &_all_proposals;
+    if(_is_qasymm8)
+    {
+        _memory_group.manage(&_all_proposals_quantized);
+        // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
+        _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
+        _quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized);
+        _all_proposals.allocator()->allocate();
+        _all_proposals_to_use = &_all_proposals_quantized;
+    }
+    // The original layer implementation first selects the best pre_nms_topN anchors (thus needing only a lightweight sort)
+    // and transforms them with bbox_transform. The generated boxes are then fed into a non-sorting NMS operation.
+    // Since we reuse the NMS layer and do not implement any pre-selection sort, we let NMS do the sorting (of all the input)
+    // and the filtering.
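+    // The number of proposals that survive NMS is therefore capped by pre_nms_topN, post_nms_topN and the total number of anchors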
+    const int   scores_nms_size = std::min<int>(std::min<int>(post_nms_topN, pre_nms_topN), total_num_anchors);
+    const float min_size_scaled = info.min_size() * info.im_scale();
+    _memory_group.manage(&_classes_nms_unused);
+    _memory_group.manage(&_keeps_nms_unused);
+
+    // Note that NMS needs outputs preinitialized.
+    auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo);
+    auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo);
+    auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
+
+    // Initialize the temporary (unused) outputs
+    _classes_nms_unused.allocator()->init(TensorInfo(TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo));
+    _keeps_nms_unused.allocator()->init(*scores_out->info());
+
+    // Save the outputs (needed at run time)
+    _scores_out          = scores_out;
+    _num_valid_proposals = num_valid_proposals;
+
+    _memory_group.manage(&_proposals_4_roi_values);
+
+    const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height());
+    _cpp_nms.configure(&_scores_flattened /*scores_in*/,
+                       _all_proposals_to_use /*boxes_in*/,
+                       nullptr /* batch_splits_in*/,
+                       scores_out /* scores_out*/,
+                       &_proposals_4_roi_values /*boxes_out*/,
+                       &_classes_nms_unused /*classes*/,
+                       nullptr /*batch_splits_out*/,
+                       &_keeps_nms_unused /*keeps*/,
+                       num_valid_proposals /* keeps_size*/,
+                       box_nms_info);
+
+    _keeps_nms_unused.allocator()->allocate();
+    _classes_nms_unused.allocator()->allocate();
+    _all_proposals_to_use->allocator()->allocate();
+    _scores_flattened.allocator()->allocate();
+
+    // Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
+    _pad_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+    _proposals_4_roi_values.allocator()->allocate();
+}
+
+Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
+                                          const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW, DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas);
+
+    const int num_anchors       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+    const int feat_width        = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+    const int feat_height       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
+    const int num_images        = scores->dimension(3);
+    const int total_num_anchors = num_anchors * feat_width * feat_height;
+    const int values_per_roi    = info.values_per_roi();
+
+    const bool is_qasymm8 = scores->data_type() == DataType::QASYMM8;
+
+    ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
+
+    if(is_qasymm8)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16);
+        const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform();
+        ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f);
+    }
+
+    TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
+
+    TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
+    TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+    if(scores->data_layout() == DataLayout::NHWC)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+    }
+
+    TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
+
+    TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+    TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(&scores_permuted_info, &scores_flattened_info));
+
+    TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
+    TensorInfo  proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0));
+    if(is_qasymm8)
+    {
+        TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(&all_anchors_info, &all_anchors_f32_info));
+
+        TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
+
+        TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
+                                                                           BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
+        proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized;
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
+                                                                           BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
+
+    if(num_valid_proposals->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32);
+    }
+
+    if(proposals->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
+        ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
+        if(is_qasymm8)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16);
+            const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform();
+            ARM_COMPUTE_RETURN_ERROR_ON(proposals_qinfo.scale != 0.125f);
+            ARM_COMPUTE_RETURN_ERROR_ON(proposals_qinfo.offset != 0);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(proposals, scores);
+        }
+    }
+
+    if(scores_out->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores_out, scores);
+    }
+
+    return Status{};
+}
+
+void NEGenerateProposalsLayer::run()
+{
+    // Acquire all the temporaries
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Compute all the anchors
+    NEScheduler::get().schedule(&_compute_anchors_kernel, Window::DimY);
+
+    // Transpose and reshape the inputs
+    if(!_is_nhwc)
+    {
+        NEScheduler::get().schedule(&_permute_deltas_kernel, Window::DimY);
+        NEScheduler::get().schedule(&_permute_scores_kernel, Window::DimY);
+    }
+
+    NEScheduler::get().schedule(&_flatten_deltas_kernel, Window::DimY);
+    NEScheduler::get().schedule(&_flatten_scores_kernel, Window::DimY);
+
+    if(_is_qasymm8)
+    {
+        NEScheduler::get().schedule(&_dequantize_anchors, Window::DimY);
+        NEScheduler::get().schedule(&_dequantize_deltas, Window::DimY);
+    }
+
+    // Build the boxes
+    NEScheduler::get().schedule(&_bounding_box_kernel, Window::DimY);
+
+    if(_is_qasymm8)
+    {
+        NEScheduler::get().schedule(&_quantize_all_proposals, Window::DimY);
+    }
+
+    // Non maxima suppression
+    _cpp_nms.run();
+
+    // Add dummy batch indexes
+    NEScheduler::get().schedule(&_pad_kernel, Window::DimY);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
new file mode 100644
index 0000000..d7cb7de
--- /dev/null
+++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+{
+}
+
+void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, float gamma, float beta, float epsilon)
+{
+    const DataLayout data_layout = input->info()->data_layout();
+
+    // Configure Kernels
+    _is_nchw = data_layout == DataLayout::NCHW;
+
+    if(!_is_nchw)
+    {
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
+
+        // Configure the function to transform the input tensor from NHWC -> NCHW
+        _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+        _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon);
+        _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+
+        _permute_output.configure(&_permuted_output, output != nullptr ? output : input, PermutationVector(2U, 0U, 1U));
+        _permuted_input.allocator()->allocate();
+        _permuted_output.allocator()->allocate();
+    }
+    else
+    {
+        _normalization_kernel.configure(input, output, gamma, beta, epsilon);
+    }
+}
+
+Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
+{
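+    // The kernel only operates on NCHW data; NHWC inputs are permuted by this function, so validation is always performed on NCHW-layout clones of the tensor infos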
+    return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon);
+}
+
+void NEInstanceNormalizationLayer::run()
+{
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Permute input
+    if(!_is_nchw)
+    {
+        _permute_input.run();
+    }
+
+    NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ);
+
+    // Permute output
+    if(!_is_nchw)
+    {
+        _permute_output.run();
+    }
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
index 264cca0..cfd996b 100644
--- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
@@ -138,8 +138,7 @@
     const float multiplier        = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
     int         output_multiplier = 0;
     int         output_shift      = 0;
-
-    quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+    quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
 
     _memory_group.manage(&_output_lowp);
     _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift);
@@ -340,12 +339,13 @@
     input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
     weights_transposed.set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
 
-    // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12))
     const TensorInfo output_lowp(output_highp.tensor_shape(), 1, DataType::QSYMM16, qsymm_3);
 
-    const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
-    ARM_COMPUTE_UNUSED(multiplier);
-    ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
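+    // multiplier = (input_scale * weights_scale) / output_scale, where the QSYMM16 output scale is fixed at 2^-12 (hence the factor of 4096); it must be expressible as a fixed-point multiplier and shift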
+    const float multiplier        = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
+    int         output_multiplier = 0;
+    int         output_shift      = 0;
+    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+
     // _output_stage
     ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp));
 
@@ -483,7 +483,7 @@
     _tanh_output_state.run();
     _mul3.run();
 
-    // Requantize output state from QSYMM16 to QASYMM16
+    // Requantize output state from QSYMM16 to QASYMM8
     _dequantize.run();
     _quantize.run();
 }
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index c608edf..cf86240 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -34,33 +34,6 @@
 {
 namespace
 {
-TensorInfo get_expected_output_tensorinfo(const ITensorInfo &input, const PaddingList &paddings)
-{
-    const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input.tensor_shape(), paddings);
-    const TensorInfo  expected_output_info  = input.clone()->set_tensor_shape(expected_output_shape);
-    return expected_output_info;
-}
-
-Status validate_arguments(const ITensorInfo &input, ITensorInfo &output, const PaddingList &paddings)
-{
-    const TensorInfo expected_output_info = get_expected_output_tensorinfo(input, paddings);
-    auto_init_if_empty(output, expected_output_info);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output, &expected_output_info);
-
-    return Status{};
-}
-
-Coordinates get_subtensor_coords(const PaddingList &paddings)
-{
-    Coordinates coords;
-    for(unsigned int i = 0; i < paddings.size(); ++i)
-    {
-        coords.set(i, paddings[i].first);
-    }
-
-    return coords;
-}
-
 uint32_t last_padding_dimension(const PaddingList &padding)
 {
     int last_padding_dim = padding.size() - 1;
@@ -76,23 +49,13 @@
 } // namespace
 
 NEPadLayer::NEPadLayer()
-    : _copy_kernel(), _mode(), _padding(), _memset_kernel(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results(), _output_subtensor()
+    : _copy_kernel(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
 {
 }
 
 void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value)
 {
-    // Auto-init
-    auto_init_if_empty(*output->info(), get_expected_output_tensorinfo(*input->info(), padding));
-
-    // Create SubTensor (Can use sub-tensor as the kernels to be executed do not require padding)
-    _output_subtensor = SubTensor(output, input->info()->tensor_shape(), get_subtensor_coords(padding), true);
-
-    // Set the pages of the output to the specified value
-    _memset_kernel.configure(output, constant_value);
-
-    // Copy the input to the output
-    _copy_kernel.configure(input, &_output_subtensor);
+    _pad_kernel.configure(input, output, padding, constant_value, PaddingMode::CONSTANT);
 }
 
 void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *output)
@@ -253,11 +216,7 @@
     {
         case PaddingMode::CONSTANT:
         {
-            auto          output_clone = output->clone();
-            SubTensorInfo output_subtensor_info(output_clone.get(), input->tensor_shape(), get_subtensor_coords(padding), true);
-            ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output_clone, padding));
-            ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(input, &output_subtensor_info));
-            break;
+            return NEPadLayerKernel::validate(input, output, padding, constant_value, mode);
         }
         case PaddingMode::REFLECT:
         case PaddingMode::SYMMETRIC:
@@ -293,8 +252,7 @@
         {
             case PaddingMode::CONSTANT:
             {
-                NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
-                NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+                NEScheduler::get().schedule(&_pad_kernel, Window::DimZ);
                 break;
             }
             case PaddingMode::REFLECT:
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index 65873b1..4464978 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -27,8 +27,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -46,3 +46,4 @@
     k->configure(input, output);
     _kernel = std::move(k);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index 9ca7ded..67f4064 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -34,8 +34,8 @@
 namespace arm_compute
 {
 NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
-      _is_prepared(false)
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected(memory_manager), _copy_kernel(), _fully_connected_out(), _gemm_output(),
+      _add_output(), _is_prepared(false)
 {
 }
 
@@ -81,7 +81,7 @@
 
     // Manage intermediate buffers and configure
     _memory_group.manage(&_fully_connected_out);
-    _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+    _fully_connected.configure(input, weights, bias, &_fully_connected_out);
 
     _memory_group.manage(&_gemm_output);
     _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
@@ -106,7 +106,7 @@
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    _fully_connected_kernel.run();
+    _fully_connected.run();
 
     _gemm_state_f.run();
 
@@ -121,7 +121,7 @@
 {
     if(!_is_prepared)
     {
-        _fully_connected_kernel.prepare();
+        _fully_connected.prepare();
         _gemm_state_f.prepare();
 
         _is_prepared = true;
diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
new file mode 100644
index 0000000..b4e0a2f
--- /dev/null
+++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+Status NEROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(NEROIAlignLayerKernel::validate(input, rois, output, pool_info));
+
+    return Status{};
+}
+
+void NEROIAlignLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+    // Configure ROI pooling kernel
+    auto k = arm_compute::support::cpp14::make_unique<NEROIAlignLayerKernel>();
+    k->configure(input, rois, output, pool_info);
+    _kernel = std::move(k);
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 0b145f0..10437f5 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -24,80 +24,127 @@
 #include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
 
 #include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+inline TensorShape calculate_reduce_mean_shape(ITensor *input, const Coordinates &reduction_axis, bool keep_dims)
+{
+    const int   reduction_ops = reduction_axis.num_dimensions();
+    Coordinates axis_local    = reduction_axis;
+    const int   input_dims    = input->info()->num_dimensions();
+    convert_negative_axis(axis_local, input_dims);
+    TensorShape out_shape = input->info()->tensor_shape();
+    // Drop the reduced dimensions if keep_dims is not set; otherwise keep them with size 1
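+    // e.g. an input of shape [W, H, C, N] reduced over axes { 1, 2 } becomes [W, N] when keep_dims is false and [W, 1, 1, N] when it is true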
+    if(!keep_dims)
+    {
+        // We have to sort the reduction axis vectors in order for remove_dimension
+        // to work properly
+        std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+        for(int i = 0; i < reduction_ops; ++i)
+        {
+            out_shape.remove_dimension(axis_local[i] - i);
+        }
+        return out_shape;
+    }
+    else
+    {
+        for(int i = 0; i < reduction_ops; ++i)
+        {
+            out_shape.set(axis_local[i], 1);
+        }
+        return out_shape;
+    }
+}
+} // namespace
 
 NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
 {
 }
 
-Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
     ARM_COMPUTE_UNUSED(keep_dims);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
-    TensorShape        out_shape     = input->tensor_shape();
     const unsigned int reduction_ops = reduction_axis.num_dimensions();
     const int          input_dims    = input->num_dimensions();
     Coordinates        axis_local    = reduction_axis;
 
-    // Convert negative axis
-    for(unsigned int i = 0; i < reduction_ops; ++i)
+    for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
     {
-        axis_local[i] = wrap_around(axis_local[i], input_dims);
+        // axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
+        ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
+        ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
     }
 
-    std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
-    for(unsigned int i = 0; i < reduction_ops; ++i)
+    if(output->tensor_shape().total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
-        ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
-        if(output->total_size() > 0 && keep_dims)
+        // Only validate the output shape if it is not auto-initialized
+        TensorShape out_shape = input->tensor_shape();
+        convert_negative_axis(axis_local, input_dims);
+        std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+        for(unsigned int i = 0; i < reduction_ops; ++i)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+            ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+            ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
+            if(output->total_size() > 0 && keep_dims)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+            }
+            if(keep_dims)
+            {
+                out_shape.set(axis_local[i], 1);
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i]));
+                const unsigned int remove_index = axis_local[i] - i;
+                ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions());
+                out_shape.remove_dimension(remove_index);
+            }
         }
-        if(keep_dims)
-        {
-            out_shape.set(axis_local[i], 1);
-        }
-        else
-        {
-            out_shape.remove_dimension(axis_local[i] - i);
-        }
+        const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
     }
-    const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-
     return Status{};
 }
 
+Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+{
+    return validate_config(input, reduction_axis, keep_dims, output);
+}
+
 void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
+    // Output auto-initialization if not yet initialized
+    const TensorShape output_shape = calculate_reduce_mean_shape(input, reduction_axis, keep_dims);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     _reduction_ops = reduction_axis.num_dimensions();
     _reduction_kernels.resize(_reduction_ops);
     _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
     _keep_dims = keep_dims;
 
-    Coordinates        axis_local    = reduction_axis;
-    const int          input_dims    = input->info()->num_dimensions();
-    const unsigned int reduction_ops = reduction_axis.num_dimensions();
+    Coordinates axis_local = reduction_axis;
+    const int   input_dims = input->info()->num_dimensions();
 
-    // Convert negative axis
-    for(unsigned int i = 0; i < reduction_ops; ++i)
-    {
-        axis_local[i] = wrap_around(axis_local[i], input_dims);
-    }
+    convert_negative_axis(axis_local, input_dims);
 
     // Perform reduction for every axis
-    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    for(int i = 0; i < _reduction_ops; ++i)
     {
         TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
         out_shape.set(axis_local[i], 1);
@@ -116,7 +163,7 @@
     }
 
     // Allocate intermediate tensors
-    for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+    for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
     {
         _reduced_outs[i].allocator()->allocate();
     }
@@ -125,11 +172,10 @@
     if(!keep_dims)
     {
         TensorShape out_shape = input->info()->tensor_shape();
-
         // We have to sort the reduction axis vectors in order for remove_dimension
         // to work properly
         std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
-        for(unsigned int i = 0; i < _reduction_ops; ++i)
+        for(int i = 0; i < _reduction_ops; ++i)
         {
             out_shape.remove_dimension(axis_local[i] - i);
         }
@@ -141,10 +187,9 @@
 void NEReduceMean::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
-
-    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    for(auto &kernel : _reduction_kernels)
     {
-        _reduction_kernels[i].run();
+        kernel.run();
     }
 
     if(!_keep_dims)
@@ -152,3 +197,4 @@
         _reshape.run();
     }
 }
+} // namespace arm_compute
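A minimal usage sketch of the reworked NEReduceMean, assuming an F32 input, the default memory manager and hypothetical tensor names; configure() now validates internally and auto-initialises the output shape:

    Tensor input{}, output{};
    input.allocator()->init(TensorInfo(TensorShape(8U, 16U, 4U), 1, DataType::F32));
    const Coordinates reduction_axis(1, 2); // reduce over dimensions 1 and 2
    NEReduceMean reduce_mean{};
    reduce_mean.configure(&input, reduction_axis, false /* keep_dims */, &output); // output shape becomes (8)
    input.allocator()->allocate();
    output.allocator()->allocate();
    reduce_mean.run();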
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index dc6cf59..397fe21 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
 
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 namespace arm_compute
@@ -52,25 +53,78 @@
 }
 } // namespace
 
-NEReductionOperation::NEReductionOperation()
-    : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis()
+NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _reduction_kernel(), _fill_border_kernel(), _reshape_kernel(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
 {
 }
 
-Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+
+    const auto is_reshape_required = !keep_dims;
+
+    auto *output_internal = output;
+
+    TensorInfo info_before_reshape;
+
+    if(is_reshape_required)
+    {
+        const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
+
+        auto shape_before_reshape = input->tensor_shape();
+        shape_before_reshape.set(axis, 1);
+
+        const auto input_num_channels = input->num_channels();
+        const auto input_qinfo        = input->quantization_info();
+        const auto is_arg_min_max     = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+        const auto output_data_type   = is_arg_min_max ? DataType::S32 : output->data_type();
+
+        info_before_reshape.set_data_type(output_data_type).set_tensor_shape(shape_before_reshape).set_num_channels(input_num_channels).set_quantization_info(input_qinfo);
+
+        output_internal = &info_before_reshape;
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output_internal, axis, op));
+
+    if(is_reshape_required)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(output_internal, output));
+    }
 
     return Status{};
 }
 
-void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
+void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op));
+
+    _is_reshape_required = !keep_dims;
+
+    auto      *output_internal = output;
+    const auto is_arg_min_max  = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+
+    if(_is_reshape_required)
+    {
+        const auto output_internal_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
+        const auto output_external_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+        const auto output_data_type      = is_arg_min_max ? DataType::S32 : input->info()->data_type();
+        const auto num_channels          = input->info()->num_channels();
+        const auto qinfo                 = input->info()->quantization_info();
+
+        _output_internal.allocator()->init(input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_internal_shape).reset_padding().set_is_resizable(true).set_num_channels(
+                                               num_channels).set_quantization_info(qinfo));
+        _memory_group.manage(&_output_internal);
+        output_internal = &_output_internal;
+        auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_external_shape).reset_padding().set_is_resizable(true));
+    }
+
+    ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims));
 
     // Configure reduction kernel
-    _reduction_kernel.configure(input, output, axis, op);
+    _reduction_kernel.configure(input, output_internal, axis, op);
     _window_split   = reduction_window_split_dimension(axis);
     _reduction_axis = axis;
 
@@ -150,7 +204,13 @@
             default:
                 ARM_COMPUTE_ERROR("Reduction Operation unsupported");
         }
-        _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue);
+        _fill_border_kernel.configure(input, fill_border_size, (is_arg_min_max ? BorderMode::REPLICATE : BorderMode::CONSTANT), pixelValue);
+    }
+
+    if(_is_reshape_required)
+    {
+        _reshape_kernel.configure(output_internal, output);
+        _output_internal.allocator()->allocate();
     }
 }
 
@@ -161,5 +221,9 @@
         NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
     }
     NEScheduler::get().schedule(&_reduction_kernel, _window_split);
+    if(_is_reshape_required)
+    {
+        NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
+    }
 }
 } // namespace arm_compute
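As a sketch of the new keep_dims path (shapes and names below are assumptions for illustration): with keep_dims set to false the reduction writes into the managed _output_internal tensor and the reshape kernel then drops the reduced axis, whereas keep_dims set to true keeps the axis with size 1 and skips the reshape:

    Tensor input{}, output{};
    input.allocator()->init(TensorInfo(TensorShape(2U, 4U, 6U), 1, DataType::F32));
    NEReductionOperation reduction{};
    // keep_dims = false: output is auto-initialised to (2, 6); the intermediate (2, 1, 6) result is reshaped away.
    reduction.configure(&input, &output, 1 /* axis */, ReductionOperation::SUM, false /* keep_dims */);
    input.allocator()->allocate();
    output.allocator()->allocate();
    reduction.run();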
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 79a9496..f530a87 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -33,13 +33,15 @@
 
 namespace arm_compute
 {
-NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+template <bool IS_LOG>
+NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_kernel_ptr(nullptr), _fill_border_kernel(), _reshape_kernel(), _max(), _tmp(), _input_flattened(),
       _output_flattened(), _needs_flattening(false)
 {
 }
 
-void NESoftmaxLayer::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, size_t axis)
+template <bool IS_LOG>
+void NESoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, size_t axis)
 {
     // Flatten the input
     const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
@@ -68,11 +70,12 @@
     auto_init_if_empty(*output->info(), *input->info()->clone());
 }
 
-void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta, size_t axis)
+template <bool IS_LOG>
+void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, float beta, size_t axis)
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayer::validate(input->info(), output->info(), beta, axis));
+    ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayerGeneric::validate(input->info(), output->info(), beta, axis));
 
     // We don't need flattening only in the case the input is 2D and axis is 1
     _needs_flattening = axis != 1;
@@ -138,7 +141,8 @@
     _tmp.allocator()->allocate();
 }
 
-Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
+template <bool IS_LOG>
+Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
 {
     // Perform validation step
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -173,12 +177,13 @@
     }
 
     ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum));
-    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(&tensor_info_tmp, &tensor_info_max_sum, output, beta, &dont_care));
+    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, output, beta, &dont_care));
 
     return Status{};
 }
 
-void NESoftmaxLayer::run()
+template <bool IS_LOG>
+void NESoftmaxLayerGeneric<IS_LOG>::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
 
@@ -196,4 +201,8 @@
         NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
     }
 }
+
+template class NESoftmaxLayerGeneric<false>;
+template class NESoftmaxLayerGeneric<true>;
+
 } // namespace arm_compute
\ No newline at end of file
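The two explicit instantiations above back both the regular and the log variant of the function; presumably (an assumption about the accompanying header, which is not part of this diff) they are exposed through aliases along these lines:

    // Assumed aliases in NESoftmaxLayer.h:
    using NESoftmaxLayer    = NESoftmaxLayerGeneric<false>; // with beta = 1: y = exp(x - max) / sum(exp(x - max))
    using NELogSoftmaxLayer = NESoftmaxLayerGeneric<true>;  // with beta = 1: y = (x - max) - log(sum(exp(x - max)))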
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index e699ad1..6983c1c 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -33,6 +33,7 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "support/ToolchainSupport.h"
 
+#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp"
 
 namespace arm_compute
@@ -232,6 +233,31 @@
     return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
 }
 
+inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
+{
+    return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ||
+           act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
+}
+
+arm_gemm::Activation arm_gemm_activation_from_acl_activation(const ActivationLayerInfo &act_info)
+{
+    switch(act_info.activation())
+    {
+        case ActivationLayerInfo::ActivationFunction::RELU:
+        {
+            return arm_gemm::Activation(arm_gemm::Activation::Type::ReLU, act_info.a(), act_info.b());
+        }
+        case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+        {
+            return arm_gemm::Activation(arm_gemm::Activation::Type::BoundedReLU, act_info.a(), act_info.b());
+        }
+        default:
+        {
+            return arm_gemm::Activation(arm_gemm::Activation::Type::None);
+        }
+    }
+}
+
 } //namespace
 
 NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
@@ -257,6 +283,8 @@
     const Size2D kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx));
     const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
 
+
+
     // Check if the Winograd configuration requires fast math
     if(!enable_fast_math)
     {
@@ -388,21 +416,15 @@
                                       * data_type_size;
 
     // Output storage
-    const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels,
-                                                                                        use_same_padding)
-                                       * data_type_size;
-    ;
-    const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
-    const int         kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
-
-    const int  output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
-    const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
-
-    const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+    const size_t output_storage_size  = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size;
+    const int    kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(out_channels, in_channels);
+    const int    output_matrix_stride = transform_output_kernel->get_matrix_stride(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels);
+    const auto   output_shape         = transform_output_kernel->get_output_shape(in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
+    const int    input_matrix_stride  = transform_input_kernel->get_matrix_stride(in_shape.n_batches, in_channels, in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
 
     // Configure GEMM
-    const int tile_rows                = iceildiv(output_shape.n_rows, output_tile.height);
-    const int tile_cols                = iceildiv(output_shape.n_cols, output_tile.width);
+    const int tile_rows                = iceildiv(output_shape.first, output_tile.height);
+    const int tile_cols                = iceildiv(output_shape.second, output_tile.width);
     const int m                        = in_shape.n_batches * tile_rows * tile_cols;
     const int k                        = in_shape.n_channels;
     const int n                        = out_channels;
@@ -489,9 +511,19 @@
         _memory_group.manage(&_output_nhwc);
         output_to_use = &_output_nhwc;
     }
-    transform_output_kernel->configure(biases, &_output_transformed,
-                                       output_matrix_stride, output_to_use,
-                                       in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels, &_output_workspace);
+    const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info);
+
+    transform_output_kernel->configure(biases,
+                                       &_output_transformed,
+                                       output_matrix_stride,
+                                       output_to_use,
+                                       in_shape.n_batches,
+                                       output_shape.first,
+                                       output_shape.second,
+                                       out_channels,
+                                       &_output_workspace,
+                                       activation);
+
     const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
     TensorInfo   output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type());
     _output_workspace.allocator()->init(output_workspace_info);
@@ -510,7 +542,7 @@
     _transform_output_kernel  = std::move(transform_output_kernel);
 
     //Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
+    _is_activationlayer_enabled = act_info.enabled() && !fuse_function_supported(act_info);
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.configure(_output, nullptr, act_info);
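A hedged sketch of how the fused activation is selected (tensor shapes, layout and names below are assumptions for the example): RELU and BOUNDED_RELU are translated into an arm_gemm::Activation and folded into the output-transform kernel, so the trailing NEActivationLayer pass only runs for activations outside that set:

    Tensor input{}, weights{}, biases{}, output{};
    input.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));      // W x H x C
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 64U, 64U), 1, DataType::F32)); // 3x3 kernels
    biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
    NEWinogradConvolutionLayer winograd{};
    const ActivationLayerInfo  act(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f);
    // fuse_function_supported(act) is true: the activation is handed to transform_output_kernel->configure()
    // and _is_activationlayer_enabled stays false, so no separate activation run follows.
    winograd.configure(&input, &weights, &biases, &output, PadStrideInfo(1, 1, 1, 1), act);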
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
deleted file mode 100644
index ac809fa..0000000
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ /dev/null
@@ -1,426 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-#include "src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h"
-
-#include <atomic>
-#include <condition_variable>
-#include <mutex>
-
-namespace arm_compute
-{
-#ifndef NO_MULTI_THREADING
-class BufferManagerMultipleThreads final : public IBufferManager
-{
-public:
-    /** Number of buffers to ping pong between */
-    static constexpr unsigned int NUM_BUFFERS = 3;
-
-    explicit BufferManagerMultipleThreads(unsigned int max_num_users)
-        : _buffers(), _max_num_users(max_num_users)
-    {
-    }
-    unsigned int num_buffers() const override
-    {
-        return NUM_BUFFERS;
-    }
-    /* - Lock the requested index if it's free and return true if it needs reshaping.
-     * - Return false without acquiring the lock if the buffer at the index is already reshaped / being reshaped.
-     * - Block if the corresponding buffer for the given index is still being used by a different index.
-     */
-    bool lock_to_reshape_if_needed(unsigned int index) override
-    {
-        Buffer &buf = get_buffer_from_index(index);
-        while(true)
-        {
-            if(buf.index == index && buf.state != State::FREE)
-            {
-                //Another thread already is reshaping / has reshaped this block: nothing to do
-                return false;
-            }
-            else
-            {
-                std::unique_lock<std::mutex> lock(buf.mutex);
-                //If the buffer is free then lock it for reshaping:
-                if(buf.state == State::FREE)
-                {
-                    buf.index = index;
-                    buf.state = State::BEING_RESHAPED;
-                    return true;
-                }
-                // Check again just in case it changed while we were acquiring the lock:
-                if(buf.index == index)
-                {
-                    //Another thread is reshaping this block already, nothing to do
-                    return false;
-                }
-                // buf.index != index: Buffer still being used by another block, need to wait
-                buf.sem.wait(lock);
-            }
-        }
-    }
-    /* Mark the buffer at the given index as reshaped and release the lock acquired via lock_to_reshape_if_needed() */
-    void mark_as_reshaped(unsigned int index) override
-    {
-        Buffer &buf = get_buffer_from_index(index);
-        {
-            std::lock_guard<std::mutex> lock(buf.mutex);
-            buf.users = _max_num_users;
-            buf.state = State::IN_USE;
-        }
-        buf.sem.notify_all();
-    }
-
-    /* Block until the buffer at the given index is reshaped */
-    void wait_for_reshaping(unsigned int index) override
-    {
-        Buffer &buf = get_buffer_from_index(index);
-        ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
-        // Check if it's already ready to use:
-        if(buf.state == State::IN_USE)
-        {
-            return;
-        }
-        std::unique_lock<std::mutex> lock(buf.mutex);
-        //Double check it didn't change while we were acquiring the lock:
-        if(buf.state == State::IN_USE)
-        {
-            return;
-        }
-        buf.sem.wait(lock);
-    }
-    /* Mark the buffer at the given index as not used by this thread anymore.
-     * Once all the threads have called this method then the buffer is marked as free again.
-     */
-    void mark_as_unused(unsigned int index) override
-    {
-        Buffer &buf = get_buffer_from_index(index);
-        ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
-        if(--buf.users == 0)
-        {
-            std::unique_lock<std::mutex> lock(buf.mutex);
-            buf.state = State::FREE;
-            lock.unlock();
-            buf.sem.notify_all();
-        }
-    }
-
-private:
-    enum class State
-    {
-        FREE,
-        BEING_RESHAPED,
-        IN_USE
-    };
-    struct Buffer
-    {
-        unsigned int            index{};
-        std::atomic_uint        users{};
-        State                   state{ State::FREE };
-        std::mutex              mutex{};
-        std::condition_variable sem{};
-    };
-    std::array<struct Buffer, NUM_BUFFERS> _buffers;
-    Buffer &get_buffer_from_index(unsigned int index)
-    {
-        return _buffers[index % NUM_BUFFERS];
-    }
-    unsigned int _max_num_users;
-};
-#endif /* NO_MULTI_THREADING */
-
-class BufferManagerSingleThread : public IBufferManager
-{
-public:
-    unsigned int num_buffers() const override
-    {
-        return 1;
-    }
-    bool lock_to_reshape_if_needed(unsigned int index) override
-    {
-        ARM_COMPUTE_UNUSED(index);
-        return true;
-    }
-    void mark_as_reshaped(unsigned int index) override
-    {
-    }
-    void wait_for_reshaping(unsigned int index) override
-    {
-    }
-    void mark_as_unused(unsigned int index) override
-    {
-    }
-};
-
-NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager))
-{
-}
-
-void NEGEMMInterleavedWrapper::run()
-{
-    prepare();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-    NEScheduler::get().run_tagged_workloads(_workloads, _tag.c_str());
-}
-
-void NEGEMMInterleavedWrapper::prepare()
-{
-    if(!_is_prepared)
-    {
-        if(_pretranspose_b)
-        {
-            _transformed_b.allocator()->allocate();
-            NEScheduler::get().schedule(_prepare_b.get(), Window::DimX);
-            _b->mark_as_unused();
-        }
-        else
-        {
-            _prepare_b->create_workloads(_b_workloads);
-        }
-        _transform_a->create_workloads(_a_workloads);
-        _matrix_multiply->create_workloads(_mm_workloads);
-
-        //Maximum number of workloads to create:
-        const unsigned int num_threads    = NEScheduler::get().num_threads();
-        const unsigned int max_iterations = std::max(num_threads, _num_windows);
-        //Maximum number of iterations the parameters allow:
-        const unsigned int num_iterations = _batch_window.num_iterations_total();
-        // Keep the smallest of the two:
-        const unsigned int num_windows  = std::min(num_iterations, max_iterations);
-        const TensorShape  window_shape = _batch_window.shape();
-        const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
-
-        // Create a 1D window to dynamically split the batch window:
-        Window win_1D;
-        win_1D.set(0, Window::Dimension(0, num_iterations));
-
-        // Create one workload for each sub-window:
-        for(unsigned int w = 0; w < num_windows; w++)
-        {
-            Window            win          = win_1D.split_window(0, w, num_windows);
-            const Coordinates start_offset = index2coords(window_shape, win.x().start());
-            const Coordinates end_offset   = index2coords(window_shape, win.x().end() - 1);
-
-            if(_pretranspose_b)
-            {
-                auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
-                {
-                    //For each block of rows in "M"
-                    auto workload_mm = this->_mm_workloads.begin();
-                    for(auto &workload_a : this->_a_workloads)
-                    {
-                        // Transform one k_block from A:
-                        this->_transform_a->transform(workload_a, info, this->_batch_window, start_offset, end_offset);
-                        // Then perform the matrix multiplication for each x block along N:
-                        for(unsigned int i = 0; i < num_x_blocks; i++)
-                        {
-                            ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
-                            this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
-                        }
-                    }
-                };
-                _workloads.emplace_back(workload);
-            }
-            else
-            {
-                auto workload = [num_threads, start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
-                {
-                    //For each block of rows in "M"
-                    auto         workload_mm = this->_mm_workloads.begin();
-                    unsigned int workload_b  = 0;
-                    //If there is only one thread then only reshape the B blocks as you need them:
-                    unsigned int workload_b_next = num_threads == 1 ? this->_b_workloads.size() : 1;
-
-                    for(auto &workload_a : this->_a_workloads)
-                    {
-                        // Transform one k_block from A:
-                        this->_transform_a->transform(workload_a, info, this->_batch_window, start_offset, end_offset);
-                        // Then perform the matrix multiplication for each x block along N:
-                        for(unsigned int i = 0; i < num_x_blocks; i++)
-                        {
-                            ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
-                            if(workload_b_next < this->_b_workloads.size())
-                            {
-                                //Lock on BufferManager: need to run it ?
-                                if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
-                                {
-                                    this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
-                                    this->_buffer_manager->mark_as_reshaped(workload_b_next);
-                                }
-                                workload_b_next++;
-                            }
-                            ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
-                            // Run if needed or wait
-                            if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
-                            {
-                                this->_prepare_b->transform(this->_b_workloads[workload_b], info);
-                                this->_buffer_manager->mark_as_reshaped(workload_b);
-                            }
-                            this->_buffer_manager->wait_for_reshaping(workload_b);
-                            this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
-                            this->_buffer_manager->mark_as_unused(workload_b);
-                            workload_b++;
-                        }
-                    }
-                };
-                _workloads.emplace_back(workload);
-            }
-        }
-        if(!_pretranspose_b && num_windows > 1 && num_windows % num_threads != 0)
-        {
-            //Make sure the number of workloads is a multiple of the number of threads to avoid dead locks:
-            for(unsigned int leftover = num_windows % num_threads; leftover != num_threads; leftover++)
-            {
-                auto workload = [this](const ThreadInfo & info)
-                {
-                    unsigned int workload_b = 0;
-                    //If there is only one thread then only reshape the B blocks as you need them:
-                    unsigned int workload_b_next = 1;
-
-                    for(unsigned int iteration = 0; iteration < this->_mm_workloads.size(); iteration++)
-                    {
-                        if(workload_b_next < this->_b_workloads.size())
-                        {
-                            //Lock on BufferManager: need to run it ?
-                            if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
-                            {
-                                this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
-                                this->_buffer_manager->mark_as_reshaped(workload_b_next);
-                            }
-                            workload_b_next++;
-                        }
-                        ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
-                        // Run if needed or wait
-                        if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
-                        {
-                            this->_prepare_b->transform(this->_b_workloads[workload_b], info);
-                            this->_buffer_manager->mark_as_reshaped(workload_b);
-                        }
-                        this->_buffer_manager->wait_for_reshaping(workload_b);
-                        this->_buffer_manager->mark_as_unused(workload_b);
-                        workload_b++;
-                    }
-                };
-                _workloads.emplace_back(workload);
-            }
-        }
-
-        _is_prepared = true;
-    }
-}
-
-void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info)
-{
-    _params         = INEGEMMWrapperKernel::extract_parameters(a, b, c, gemm_info);
-    _a              = a;
-    _b              = b;
-    _c              = c;
-    _pretranspose_b = gemm_info.pretranpose_B();
-
-    const DataType     input_type  = a->info()->data_type();
-    const CPUInfo     &ci          = NEScheduler::get().cpu_info();
-    const unsigned int num_threads = NEScheduler::get().num_threads();
-
-    const arm_gemm::KernelDescription gemm_kernel_info = get_gemm_info(input_type, ci, num_threads, _params, alpha, beta, _pretranspose_b);
-    ARM_COMPUTE_ERROR_ON(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMM_INTERLEAVED);
-
-    // Forcing 128-byte alignment (required by 32-bit kernels)
-    const unsigned int alignment = 128;
-    _transformed_b.allocator()->init(TensorInfo{}, alignment);
-    _tmp_c.allocator()->init(TensorInfo{}, alignment);
-    _tag = "NEGEMMInterleaved_" + gemm_kernel_info.name;
-
-    // Get strategy
-    std::unique_ptr<detail::IInterleavedStrategy> strategy = detail::create_strategy(gemm_kernel_info.name);
-    _num_windows                                           = iceildiv(_params.M, strategy->out_height()) * _params.batches;
-    ARM_COMPUTE_ERROR_ON(strategy == nullptr);
-
-    if(!_pretranspose_b)
-    {
-        _block_sizes = strategy->calculate_block_sizes_for_strategy(ci, _params);
-        _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
-        _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
-        // If the execution is single threaded or has only one window then the buffer manager only needs 1 buffer else we will use NUM_BUFFERS buffers and ping pong between them:
-        const unsigned int num_iterations = _batch_window.num_iterations_total();
-        if(NEScheduler::get().num_threads() == 1 || num_iterations == 1)
-        {
-            _buffer_manager = support::cpp14::make_unique<BufferManagerSingleThread>();
-        }
-        else
-        {
-#ifdef NO_MULTI_THREADING
-            ARM_COMPUTE_ERROR("Can't have more than 1 buffer without multiple threads");
-#else  /* NO_MULTI_THREADING */
-            _buffer_manager = support::cpp14::make_unique<BufferManagerMultipleThreads>(NEScheduler::get().num_threads());
-#endif /* NO_MULTI_THREADING */
-        }
-        // If B is transposed at every iteration then transformed_B can be managed:
-        _memory_group.manage(&_transformed_b);
-        auto_init_if_empty(*_transformed_b.info(), _b->info()->clone()->set_tensor_shape(TensorShape(_block_sizes.x_block * _block_sizes.k_block, _buffer_manager->num_buffers())));
-    }
-    else
-    {
-        _tag += "_preB";
-    }
-
-    _prepare_b = strategy->instantiate_prepareB(b, &_transformed_b, _params, ci);
-    ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
-
-    if(_pretranspose_b)
-    {
-        _block_sizes = _prepare_b->block_sizes();
-        _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
-        _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
-    }
-
-    _block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block));
-    _block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block));
-    _block_walker.set(Window::DimZ, Window::Dimension(0, _params.multis));
-
-    _transformed_a.allocator()->init(TensorInfo(TensorShape{ _block_sizes.k_block, _block_sizes.m_round, _params.batches }, 1, input_type), alignment);
-    _memory_group.manage(&_transformed_a);
-    _memory_group.manage(&_tmp_c);
-
-    _transform_a     = strategy->instantiate_transformA(_a, &_transformed_a, _block_walker, _params, gemm_info);
-    _matrix_multiply = strategy->instantiate_matrix_multiply(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, alpha, beta, gemm_info, num_threads);
-    ARM_COMPUTE_ERROR_ON(_transform_a == nullptr);
-    ARM_COMPUTE_ERROR_ON(_matrix_multiply == nullptr);
-
-    _transformed_a.allocator()->allocate();
-    _tmp_c.allocator()->allocate();
-    if(!_pretranspose_b)
-    {
-        _transformed_b.allocator()->allocate();
-    }
-}
-} // namespace arm_compute