arm_compute v19.05
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index c5d42b1..1323bb3 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -66,7 +66,7 @@
     std::vector<BlobInfo> group_sizes;
     std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
     {
-        return BlobInfo(b.max_size, b.max_alignment);
+        return BlobInfo{ b.max_size, b.max_alignment };
     });
 
     // Update blob sizes
@@ -75,7 +75,7 @@
     group_sizes.resize(max_size);
     std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs)
     {
-        return BlobInfo(std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment));
+        return BlobInfo{ std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment) };
     });
 
     // Calculate group mappings
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
index 533e6fa..8bc7b8e 100644
--- a/src/runtime/CL/CLHelpers.cpp
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -47,7 +47,7 @@
  * @return A pointer to the context properties which can be used to create an opencl context
  */
 
-void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, cl_context_properties prop[7])
+void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, std::array<cl_context_properties, 7> &prop)
 {
     ARM_COMPUTE_UNUSED(device);
 #if defined(ARM_COMPUTE_ASSERTS_ENABLED)
@@ -55,7 +55,7 @@
     if(arm_compute::device_supports_extension(device, "cl_arm_printf"))
     {
         // Create a cl_context with a printf_callback and user specified buffer size.
-        cl_context_properties properties_printf[] =
+        std::array<cl_context_properties, 7> properties_printf =
         {
             CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
             // Enable a printf callback function for this context.
@@ -65,17 +65,17 @@
             CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
             0
         };
-        std::copy_n(properties_printf, 7, prop);
+        prop = properties_printf;
     }
     else
 #endif // defined(ARM_COMPUTE_ASSERTS_ENABLED)
     {
-        cl_context_properties properties[] =
+        std::array<cl_context_properties, 3> properties =
         {
             CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
             0
         };
-        std::copy_n(properties, 3, prop);
+        std::copy(properties.begin(), properties.end(), prop.begin());
     };
 }
 } //namespace
@@ -94,11 +94,11 @@
     std::vector<cl::Device> platform_devices;
     p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
     ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
-    device                              = platform_devices[0];
-    cl_int                err           = CL_SUCCESS;
-    cl_context_properties properties[7] = { 0, 0, 0, 0, 0, 0, 0 };
+    device     = platform_devices[0];
+    cl_int err = CL_SUCCESS;
+    std::array<cl_context_properties, 7> properties = { 0, 0, 0, 0, 0, 0, 0 };
     initialise_context_properties(p, device, properties);
-    cl::Context cl_context = cl::Context(device, properties, nullptr, nullptr, &err);
+    cl::Context cl_context = cl::Context(device, properties.data(), nullptr, nullptr, &err);
     ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
     return std::make_tuple(cl_context, device, err);
 }
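
Editor's note: the hunk above replaces the raw cl_context_properties arrays with std::array, so the property list carries its own size and can be copied by assignment instead of std::copy_n with a hard-coded count. A minimal standalone sketch of the same pattern, assuming the library's OpenCL C++ wrapper header; make_context is an illustrative helper, not part of the library:

    #include "arm_compute/core/CL/OpenCL.h"
    #include <array>

    // Build a fixed-size property list and hand its data() pointer to cl::Context,
    // mirroring the std::array-based initialise_context_properties() above.
    cl::Context make_context(const cl::Platform &platform, const cl::Device &device)
    {
        std::array<cl_context_properties, 3> props =
        {
            CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
            0
        };
        cl_int err = CL_SUCCESS;
        return cl::Context(device, props.data(), nullptr, nullptr, &err);
    }
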
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index 5bea85c..557378b 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,8 +33,8 @@
 {
 }
 
-CLMemory::CLMemory(std::shared_ptr<ICLMemoryRegion> memory)
-    : _region(nullptr), _region_owned(std::move(memory))
+CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory)
+    : _region(nullptr), _region_owned(memory)
 {
     _region_owned = memory;
     _region       = _region_owned.get();
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
index 88d45ac..2577ec0 100644
--- a/src/runtime/CL/CLMultiHOG.cpp
+++ b/src/runtime/CL/CLMultiHOG.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,8 +30,9 @@
 using namespace arm_compute;
 
 CLMultiHOG::CLMultiHOG(size_t num_models)
-    : _num_models(num_models), _model(arm_compute::support::cpp14::make_unique<CLHOG[]>(_num_models))
+    : _num_models(num_models), _model()
 {
+    _model.resize(_num_models);
 }
 
 size_t CLMultiHOG::num_models() const
@@ -42,11 +43,11 @@
 ICLHOG *CLMultiHOG::cl_model(size_t index)
 {
     ARM_COMPUTE_ERROR_ON(index >= _num_models);
-    return (_model.get() + index);
+    return (&_model[index]);
 }
 
 const ICLHOG *CLMultiHOG::cl_model(size_t index) const
 {
     ARM_COMPUTE_ERROR_ON(index >= _num_models);
-    return (_model.get() + index);
+    return (&_model[index]);
 }
\ No newline at end of file
diff --git a/src/runtime/CL/CLPyramid.cpp b/src/runtime/CL/CLPyramid.cpp
index 865f389..6d5dba0 100644
--- a/src/runtime/CL/CLPyramid.cpp
+++ b/src/runtime/CL/CLPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,7 +35,7 @@
 using namespace arm_compute;
 
 CLPyramid::CLPyramid()
-    : _info(), _pyramid(nullptr)
+    : _info(), _pyramid()
 {
 }
 
@@ -51,8 +51,8 @@
 
 void CLPyramid::internal_init(const PyramidInfo &info, bool auto_padding)
 {
-    _info    = info;
-    _pyramid = arm_compute::support::cpp14::make_unique<CLTensor[]>(_info.num_levels());
+    _info = info;
+    _pyramid.resize(_info.num_levels());
 
     size_t      w            = _info.width();
     size_t      h            = _info.height();
@@ -109,11 +109,9 @@
 
 void CLPyramid::allocate()
 {
-    ARM_COMPUTE_ERROR_ON(_pyramid == nullptr);
-
     for(size_t i = 0; i < _info.num_levels(); ++i)
     {
-        (_pyramid.get() + i)->allocator()->allocate();
+        _pyramid[i].allocator()->allocate();
     }
 }
 
@@ -126,5 +124,5 @@
 {
     ARM_COMPUTE_ERROR_ON(index >= _info.num_levels());
 
-    return (_pyramid.get() + index);
+    return &_pyramid[index];
 }
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index 0307498..101e4f1 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,7 +34,7 @@
 
 namespace
 {
-std::unique_ptr<ICLMemoryRegion> allocate_region(cl::Context context, size_t size, cl_uint alignment)
+std::unique_ptr<ICLMemoryRegion> allocate_region(const cl::Context &context, size_t size, cl_uint alignment)
 {
     // Try fine-grain SVM
     std::unique_ptr<ICLMemoryRegion> region = support::cpp14::make_unique<CLFineSVMMemoryRegion>(context,
@@ -101,10 +101,10 @@
     info().set_is_resizable(true);
 }
 
-arm_compute::Status CLTensorAllocator::import_memory(cl::Buffer buffer)
+Status CLTensorAllocator::import_memory(cl::Buffer buffer)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(buffer.get() == nullptr);
-    ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_SIZE>() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_SIZE>() < info().total_size());
     ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_CONTEXT>().get() != CLScheduler::get().context().get());
     ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
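
Editor's note: import_memory() above now rejects buffers smaller than the tensor's total_size() instead of merely rejecting empty ones. A hedged usage sketch of the stricter contract; the shape and buffer flags are illustrative, and CLScheduler::get().default_init() is assumed to have been called beforehand:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    void import_buffer_example()
    {
        using namespace arm_compute;

        CLTensor tensor;
        tensor.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

        // The buffer must live in the scheduler's context and, with this patch,
        // hold at least info()->total_size() bytes.
        cl::Buffer buffer(CLScheduler::get().context(), CL_MEM_READ_WRITE, tensor.info()->total_size());
        Status status = tensor.allocator()->import_memory(buffer);
        ARM_COMPUTE_ERROR_THROW_ON(status);
    }
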
 
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index a262d6b..2c3f9ce 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/CLTuner.h"
+#include "arm_compute/runtime/CL/tuners/CLLWSList.h"
 
 #include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/Error.h"
@@ -31,42 +32,13 @@
 #include <fstream>
 #include <iostream>
 #include <limits>
+#include <memory>
 #include <string>
 
 namespace arm_compute
 {
-namespace
-{
-/** Utility function used to initialize the LWS values to test.
- *  Only the LWS values which are power of 2 or satisfy the modulo conditions with GWS are taken into account by the CLTuner
- *
- * @param[in, out] lws         Vector of LWS to test for a specific dimension
- * @param[in]      gws         Size of the GWS
- * @param[in]      lws_max     Max LKWS value allowed to be tested
- * @param[in]      mod_let_one True if the results of the modulo operation between gws and the lws can be less than one.
- */
-void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
-{
-    lws.push_back(1);
-
-    for(unsigned int i = 2; i <= lws_max; ++i)
-    {
-        // Power of two condition
-        const bool is_power_of_two = (i & (i - 1)) == 0;
-
-        // Condition for the module accordingly with the mod_let_one flag
-        const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
-
-        if(mod_cond || is_power_of_two)
-        {
-            lws.push_back(i);
-        }
-    }
-}
-} // namespace
-
 CLTuner::CLTuner(bool tune_new_kernels)
-    : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
+    : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuner_mode(CLTunerMode::NORMAL)
 {
 }
 
@@ -88,6 +60,15 @@
     return _tune_new_kernels;
 }
 
+void CLTuner::set_tuner_mode(CLTunerMode mode)
+{
+    _tuner_mode = mode;
+}
+CLTunerMode CLTuner::get_tuner_mode() const
+{
+    return _tuner_mode;
+}
+
 void CLTuner::tune_kernel_static(ICLKernel &kernel)
 {
     ARM_COMPUTE_UNUSED(kernel);
@@ -182,61 +163,53 @@
     };
     CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
 
-    cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
+    cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
 
-    cl::NDRange gws     = ICLKernel::gws_from_window(kernel.window());
+    // Run the kernel with default lws to be used as baseline
+    kernel.run(kernel.window(), queue_profiler);
+
+    queue_profiler.finish();
+
+    const cl_ulong start         = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+    const cl_ulong end           = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+    cl_ulong       min_exec_time = end - start;
+    _kernel_event                = nullptr;
+
     cl::NDRange opt_lws = cl::NullRange;
 
-    const unsigned int lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 64u);
-    const unsigned int lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 32u);
-    const unsigned int lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 32u);
-
-    std::vector<unsigned int> lws_x;
-    std::vector<unsigned int> lws_y;
-    std::vector<unsigned int> lws_z;
-
-    // Initialize the LWS values to test
-    initialize_lws_values(lws_x, gws[0], lws_x_max, gws[2] > 16);
-    initialize_lws_values(lws_y, gws[1], lws_y_max, gws[2] > 16);
-    initialize_lws_values(lws_z, gws[2], lws_z_max, false);
-
-    for(const auto &z : lws_z)
+    //Construct the list of LWS values to be tested based on the tuner mode.
+    auto lws_list = cl_tuner::CLLWSListFactory::get_lws_list(_tuner_mode, gws);
+    for(size_t i = 0; i < lws_list->size(); ++i)
     {
-        for(const auto &y : lws_y)
+        cl::NDRange lws_test    = (*lws_list)[i];
+        auto        x           = lws_test[0];
+        auto        y           = lws_test[1];
+        auto        z           = lws_test[2];
+        const bool  invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
+
+        if(invalid_lws)
         {
-            for(const auto &x : lws_x)
-            {
-                cl::NDRange lws_test = cl::NDRange(x, y, z);
+            continue;
+        }
 
-                bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
+        //Set the Local-Workgroup-Size
+        kernel.set_lws_hint(lws_test);
 
-                invalid_lws = invalid_lws || (x > gws[0]) || (y > gws[1]) || (z > gws[2]);
+        // Run the kernel
+        kernel.run(kernel.window(), queue_profiler);
 
-                if(invalid_lws)
-                {
-                    continue;
-                }
+        queue_profiler.finish();
 
-                //Set the Local-Workgroup-Size
-                kernel.set_lws_hint(lws_test);
+        const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+        const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+        const cl_ulong diff  = end - start;
+        _kernel_event        = nullptr;
 
-                // Run the kernel
-                kernel.run(kernel.window(), queue_profiler);
-
-                queue_profiler.finish();
-
-                const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
-                const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
-                const cl_ulong diff  = end - start;
-                _kernel_event        = nullptr;
-
-                // Check the execution time
-                if(diff < min_exec_time)
-                {
-                    min_exec_time = diff;
-                    opt_lws       = cl::NDRange(x, y, z);
-                }
-            }
+        // Check the execution time
+        if(diff < min_exec_time)
+        {
+            min_exec_time = diff;
+            opt_lws       = cl::NDRange(x, y, z);
         }
     }
 
@@ -301,7 +274,7 @@
     std::ofstream fs;
     fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
     fs.open(filename, std::ios::out);
-    for(auto kernel_data : _lws_table)
+    for(auto const &kernel_data : _lws_table)
     {
         fs << kernel_data.first << ";" << kernel_data.second[0] << ";" << kernel_data.second[1] << ";" << kernel_data.second[2] << std::endl;
     }
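
Editor's note: the LWS candidates are now produced by CLLWSListFactory according to the new CLTunerMode, and a run with the default LWS is timed first as the baseline. A hedged sketch of how a caller selects the mode; the EXHAUSTIVE and RAPID alternatives are assumed from the CLTunerMode definition that ships with this release:

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTuner.h"

    // Choose how large an LWS search space the tuner explores, then register it
    // with the scheduler so untuned kernels are tuned on first run.
    void init_with_tuner()
    {
        static arm_compute::CLTuner tuner(true /* tune kernels not yet in the LWS table */);
        tuner.set_tuner_mode(arm_compute::CLTunerMode::NORMAL); // assumed alternatives: EXHAUSTIVE, RAPID
        arm_compute::CLScheduler::get().default_init(&tuner);   // the tuner must outlive all tuned runs
    }
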
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 84e8709..4c7458d 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -177,7 +177,7 @@
 
 void CLCannyEdge::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run sobel
     _sobel->run();
@@ -199,6 +199,4 @@
     _l1_list_counter.clear(CLScheduler::get().queue());
     _l1_stack.clear(CLScheduler::get().queue());
     CLScheduler::get().enqueue(_edge_trace, true);
-
-    _memory_group.release();
 }
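
Editor's note: run() now uses MemoryGroupResourceScope instead of a paired acquire()/release(); the same replacement recurs in several functions below. A minimal sketch of the RAII idiom; run_with_scoped_memory is an illustrative helper, not library code:

    #include "arm_compute/core/CL/ICLKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/IMemoryGroup.h"

    // The scope object acquires the group's backing memory on construction and
    // releases it when the function returns, even if enqueue() throws.
    void run_with_scoped_memory(arm_compute::IMemoryGroup &memory_group, arm_compute::ICLKernel &kernel)
    {
        arm_compute::MemoryGroupResourceScope scope_mg(memory_group); // was: memory_group.acquire();
        arm_compute::CLScheduler::get().enqueue(kernel);
    }                                                                 // was: memory_group.release();
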
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index 018c674..b8224d2 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,9 @@
  */
 #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
 
+#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
 #include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
 
@@ -35,56 +38,168 @@
 namespace arm_compute
 {
 CLConcatenateLayer::CLConcatenateLayer()
-    : _concat_function(nullptr)
+    : _concat_kernels(),
+      _num_inputs(0),
+      _axis(Window::DimX)
 {
 }
 
-void CLConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, DataLayoutDimension axis)
+void CLConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
 {
     ARM_COMPUTE_ERROR_ON(output == nullptr);
+    _axis       = axis;
+    _num_inputs = inputs_vector.size();
 
-    switch(get_data_layout_dimension_index(output->info()->data_layout(), axis))
+    std::vector<ITensorInfo *> inputs_vector_info(inputs_vector.size());
+    std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](ICLTensor * t)
     {
-        case 0:
+        ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+        return t->info();
+    });
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+    ARM_COMPUTE_ERROR_THROW_ON(CLConcatenateLayer::validate(inputs_vector_info, output->info(), axis));
+
+    unsigned int offset = 0;
+    switch(_axis)
+    {
+        case Window::DimX:
         {
-            auto func = support::cpp14::make_unique<CLWidthConcatenateLayer>();
-            func->configure(inputs_vector, output);
-            _concat_function = std::move(func);
+            switch(_num_inputs)
+            {
+                case 2:
+                {
+                    // Configure WidthConcatenate2Tensors kernel
+                    auto kernel = support::cpp14::make_unique<CLWidthConcatenate2TensorsKernel>();
+                    kernel->configure(inputs_vector.at(0), inputs_vector.at(1), output);
+                    _concat_kernels.emplace_back(std::move(kernel));
+                    break;
+                }
+                case 4:
+                {
+                    // Configure WidthConcatenate4Tensors kernel
+                    auto kernel = support::cpp14::make_unique<CLWidthConcatenate4TensorsKernel>();
+                    kernel->configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
+                    _concat_kernels.emplace_back(std::move(kernel));
+                    break;
+                }
+                default:
+                {
+                    // Configure generic case WidthConcatenate kernels
+                    for(unsigned int i = 0; i < _num_inputs; ++i)
+                    {
+                        auto kernel = support::cpp14::make_unique<CLWidthConcatenateLayerKernel>();
+                        kernel->configure(inputs_vector.at(i), offset, output);
+                        offset += inputs_vector.at(i)->info()->dimension(_axis);
+                        _concat_kernels.emplace_back(std::move(kernel));
+                    }
+                    break;
+                }
+            }
             break;
         }
-        case 2:
+        case Window::DimY:
         {
-            auto func = support::cpp14::make_unique<CLDepthConcatenateLayer>();
-            func->configure(inputs_vector, output);
-            _concat_function = std::move(func);
+            for(unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                auto kernel = support::cpp14::make_unique<CLHeightConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                offset += inputs_vector.at(i)->info()->dimension(_axis);
+                _concat_kernels.emplace_back(std::move(kernel));
+            }
+            break;
+        }
+        case Window::DimZ:
+        {
+            for(unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                auto kernel = support::cpp14::make_unique<CLDepthConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                offset += inputs_vector.at(i)->info()->dimension(_axis);
+                _concat_kernels.emplace_back(std::move(kernel));
+            }
             break;
         }
         default:
-            ARM_COMPUTE_ERROR("Concatenation is supported across width and depth only!");
+            ARM_COMPUTE_ERROR("Axis not supported");
     }
 }
 
-Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, DataLayoutDimension axis)
+Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
+    const unsigned int num_inputs = inputs_vector.size();
 
-    switch(get_data_layout_dimension_index(output->data_layout(), axis))
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
+
+    unsigned int offset = 0;
+    switch(axis)
     {
-        case 0:
-            ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayer::validate(inputs_vector, output));
+        case Window::DimX:
+        {
+            switch(num_inputs)
+            {
+                case 2:
+                    // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
+                    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]);
+                    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], output));
+                    break;
+                case 4:
+                    // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
+                    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]);
+                    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], output));
+                    break;
+                default:
+                    // Validate generic case of WidthConcatenate kernel
+                    for(const auto &input : inputs_vector)
+                    {
+                        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+                        ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, offset, output));
+                        offset += input->dimension(axis);
+                    }
+                    break;
+            }
             break;
-        case 2:
-            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayer::validate(inputs_vector, output));
+        }
+        case Window::DimY:
+        {
+            for(const auto &input : inputs_vector)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(CLHeightConcatenateLayerKernel::validate(input, offset, output));
+                offset += input->dimension(axis);
+            }
             break;
+        }
+        case Window::DimZ:
+        {
+            for(const auto &input : inputs_vector)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayerKernel::validate(input, offset, output));
+                offset += input->dimension(axis);
+            }
+            break;
+        }
         default:
-            ARM_COMPUTE_RETURN_ERROR_MSG("Concatenation is supported across width and depth only!");
+            ARM_COMPUTE_ERROR("Axis not supported");
     }
+
+    if(output->total_size() != 0)
+    {
+        TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis);
+        ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+    }
+
     return Status{};
 }
 
 void CLConcatenateLayer::run()
 {
-    ARM_COMPUTE_ERROR_ON(_concat_function == nullptr);
-    _concat_function->run();
+    for(auto &kernel : _concat_kernels)
+    {
+        CLScheduler::get().enqueue(*kernel, true);
+    }
 }
 } // namespace arm_compute
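
Editor's note: configure() and validate() now take a plain size_t axis (Window::DimX/DimY/DimZ) instead of a DataLayoutDimension, gain a height (DimY) path, and enqueue the concatenation kernels directly rather than delegating to the width/depth functions. A hedged usage sketch; the tensors are assumed to be initialised and allocated elsewhere:

    #include "arm_compute/core/Window.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
    #include <vector>

    // Concatenate two tensors along the height axis with the new size_t axis API.
    void concat_height(arm_compute::CLTensor &a, arm_compute::CLTensor &b, arm_compute::CLTensor &out)
    {
        std::vector<arm_compute::ICLTensor *> inputs = { &a, &b };

        arm_compute::CLConcatenateLayer concat;
        concat.configure(inputs, &out, arm_compute::Window::DimY); // previously a DataLayoutDimension
        concat.run();
    }
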
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index 0131801..f09585e 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,13 +58,13 @@
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON(conv == nullptr);
-    int16_t conv_col[matrix_size];
-    int16_t conv_row[matrix_size];
-    _is_separable = separate_matrix(conv, conv_col, conv_row, matrix_size);
+    std::array<int16_t, matrix_size> conv_col{ 0 };
+    std::array<int16_t, matrix_size> conv_row{ 0 };
+    _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size);
 
     if(_is_separable)
     {
-        std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size);
+        std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size);
         _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
 
         // Manage intermediate buffers
@@ -75,8 +75,8 @@
             scale = calculate_matrix_scale(conv, matrix_size);
         }
 
-        _kernel_hor.configure(input, &_tmp, conv_row, border_mode == BorderMode::UNDEFINED);
-        _kernel_vert.configure(&_tmp, output, conv_col, scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
+        _kernel_hor.configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
+        _kernel_vert.configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
         _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
 
         // Allocate intermediate buffer
@@ -96,12 +96,10 @@
 
     if(_is_separable)
     {
-        _memory_group.acquire();
+        MemoryGroupResourceScope scope_mg(_memory_group);
 
         CLScheduler::get().enqueue(_kernel_hor, false);
         CLScheduler::get().enqueue(_kernel_vert);
-
-        _memory_group.release();
     }
     else
     {
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 0014e71..165d523 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -75,6 +75,13 @@
             _function = std::move(f);
             break;
         }
+        case ConvolutionMethod::FFT:
+        {
+            auto f = arm_compute::support::cpp14::make_unique<CLFFTConvolutionLayer>(_memory_manager);
+            f->configure(input, weights, biases, output, conv_info, act_info);
+            _function = std::move(f);
+            break;
+        }
         default:
             ARM_COMPUTE_ERROR("Not supported.");
             break;
@@ -111,6 +118,12 @@
             ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups));
             break;
         }
+        case ConvolutionMethod::FFT:
+        {
+            // Validate FFT-based convolution layer
+            ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
+            break;
+        }
         default:
             ARM_COMPUTE_ERROR("Not supported.");
             break;
@@ -169,12 +182,20 @@
         return (*found).second;
     }
 
-    if(dilation != Size2D(1U, 1U) || (input->dimension(idx_c) < 16))
+    if(dilation != Size2D(1U, 1U))
     {
         return ConvolutionMethod::GEMM;
     }
     else
     {
+        if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
+        {
+            return ConvolutionMethod::FFT;
+        }
+        if(input->dimension(idx_c) < 16)
+        {
+            return ConvolutionMethod::GEMM;
+        }
         return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
     }
 }
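
Editor's note: get_convolution_method() now returns ConvolutionMethod::FFT when the kernel is large (height > 7), the input has more channels than the output, and CLFFTConvolutionLayer::validate() passes; the GEMM/Winograd fallback is otherwise unchanged. The sketch below queries the heuristic before configuring; the exact parameter list is an assumption based on the v19.05 header, only the decision logic comes from the hunk above:

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

    arm_compute::ConvolutionMethod query_method(const arm_compute::ITensorInfo *input,
                                                const arm_compute::ITensorInfo *weights,
                                                const arm_compute::ITensorInfo *output,
                                                const arm_compute::PadStrideInfo &conv_info)
    {
        return arm_compute::CLConvolutionLayer::get_convolution_method(
            input, weights, output, conv_info,
            arm_compute::WeightsInfo(), arm_compute::ActivationLayerInfo(),
            arm_compute::CLScheduler::get().target(), // GPU target used by the known-configuration table
            arm_compute::Size2D(1U, 1U),              // any other dilation always selects GEMM
            false);                                   // enable_fast_math
    }
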
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
new file mode 100644
index 0000000..b22809e
--- /dev/null
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/CLHelpers.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/functions/CLCropResize.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace
+{
+inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index)
+{
+    batch_index = *(reinterpret_cast<int32_t *>(box_ind->ptr_to_element(Coordinates(crop_box_ind))));
+
+    // _crop_box_ind is used to index crop_boxes and retrieve the appropriate crop box.
+    // The crop box is specified by normalized coordinates [y0, x0, y1, x1].
+    const float x0 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(1, crop_box_ind)));
+    const float y0 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(0, crop_box_ind)));
+    const float x1 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(3, crop_box_ind)));
+    const float y1 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(2, crop_box_ind)));
+    // The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers.
+    start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
+                        std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
+    end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
+                      std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
+    const TensorShape out_shape(input->info()->tensor_shape()[0], abs(end[0] - start[0]) + 1, abs(end[1] - start[1]) + 1);
+    output->info()->set_tensor_shape(out_shape);
+}
+
+inline void run_crop(const ICLTensor *input, ICLTensor *output, uint32_t batch_index, Coordinates start, Coordinates end, float extrapolation_value)
+{
+    bool is_width_flipped  = end[0] < start[0];
+    bool is_height_flipped = end[1] < start[1];
+    /** The number of rows out of bounds at the start and end of output. */
+    std::array<int32_t, 2> rows_out_of_bounds{ 0 };
+    /** The number of columns out of bounds at the start and end of output. */
+    std::array<int32_t, 2> cols_out_of_bounds{ 0 };
+    if(is_height_flipped)
+    {
+        rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(input->info()->dimension(2)) ? std::min(start[1] - input->info()->dimension(2) + 1, output->info()->dimension(2)) : 0;
+        rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(output->info()->dimension(2))) : 0;
+    }
+    else
+    {
+        rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast<int32_t>(output->info()->dimension(2))) : 0;
+        rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(input->info()->dimension(2)) ? std::min(end[1] - input->info()->dimension(2) + 1, output->info()->dimension(2)) : 0;
+    }
+    if(is_width_flipped)
+    {
+        cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(input->info()->dimension(1)) ? std::min(start[0] - input->info()->dimension(1) + 1, output->info()->dimension(1)) : 0;
+        cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(output->info()->dimension(1))) : 0;
+    }
+    else
+    {
+        cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(output->info()->dimension(1))) : 0;
+        cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(input->info()->dimension(1)) ? std::min(end[0] - input->info()->dimension(1) + 1, output->info()->dimension(1)) : 0;
+    }
+
+    Window full_window = calculate_max_window(*output->info());
+
+    //  Full output window:
+    //  --------------------------------
+    //  |          Out of bounds       |
+    //  |          rows before         |
+    //  |------------------------------|
+    //  | Out of | In         | Out of |
+    //  | bounds | bounds     | bounds |
+    //  | cols   | elements   | cols   |
+    //  | before | copied     | after  |
+    //  |        | from input |        |
+    //  |------------------------------|
+    //  |        Out of bounds         |
+    //  |        rows after            |
+    //  |------------------------------|
+    // Use a separate output window for each section of the full output window.
+    // Fill all output rows that have no elements that are within the input bounds
+    // with the extrapolation value using memset.
+    // First for the rows before the in bounds rows.
+    if(rows_out_of_bounds[0] > 0)
+    {
+        Window slice_fill_rows_before(full_window);
+        slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1));
+        auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+        kernel->configure(output, extrapolation_value, &slice_fill_rows_before);
+        CLScheduler::get().enqueue(*kernel);
+    }
+
+    Window slice_in(full_window);
+    slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], output->info()->dimension(2) - rows_out_of_bounds[1], 1));
+    slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], output->info()->dimension(1) - cols_out_of_bounds[1], 1));
+
+    int rows_in_bounds = static_cast<int32_t>(output->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1];
+    if(rows_in_bounds > 0)
+    {
+        // Fill all elements that share a row with an in bounds element with the extrapolation value.
+        if(cols_out_of_bounds[0] > 0)
+        {
+            Window slice_fill_cols_before(slice_in);
+            slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1));
+            auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+            kernel->configure(output, extrapolation_value, &slice_fill_cols_before);
+            CLScheduler::get().enqueue(*kernel);
+        }
+
+        if(cols_out_of_bounds[1] > 0)
+        {
+            Window slice_fill_cols_after(slice_in);
+            slice_fill_cols_after.set(1, Window::Dimension(output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1), 1));
+            auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+            kernel->configure(output, extrapolation_value, &slice_fill_cols_after);
+            CLScheduler::get().enqueue(*kernel);
+        }
+
+        // Copy all elements within the input bounds from the input tensor.
+        int cols_in_bounds = static_cast<int32_t>(output->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1];
+        if(cols_in_bounds > 0)
+        {
+            Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
+                                    is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] };
+            Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
+                                  is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 };
+            auto kernel = arm_compute::support::cpp14::make_unique<CLCropKernel>();
+
+            kernel->configure(input, output, start_in, end_in, batch_index, extrapolation_value, &slice_in);
+            CLScheduler::get().enqueue(*kernel);
+        }
+    }
+
+    // Fill all rows after the in bounds elements with the extrapolation value.
+    if(rows_out_of_bounds[1] > 0)
+    {
+        Window slice_fill_rows_after(full_window);
+        slice_fill_rows_after.set(2, Window::Dimension(output->info()->dimension(2) - rows_out_of_bounds[1], output->info()->dimension(2), 1));
+        auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+        kernel->configure(output, extrapolation_value, &slice_fill_rows_after);
+        CLScheduler::get().enqueue(*kernel);
+    }
+}
+} // namespace
+
+CLCropResize::CLCropResize()
+    : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results()
+{
+}
+
+Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output,
+                              Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
+    ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
+    TensorInfo temp_info;
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCropKernel::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value));
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        TensorShape out_shape(input->tensor_shape()[0], crop_size.x, crop_size.y, boxes->tensor_shape()[1]);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), out_shape);
+    }
+    return Status{};
+}
+
+void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
+                             InterpolationPolicy method, float extrapolation_value)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+
+    _num_boxes = boxes->info()->tensor_shape()[1];
+    TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y);
+
+    _input               = input;
+    _boxes               = boxes;
+    _box_ind             = box_ind;
+    _output              = output;
+    _method              = method;
+    _extrapolation_value = extrapolation_value;
+
+    // For each crop box:
+    // - The initial cropped image is produced as specified by boxes[i] from the 3D image input[box_ind[i]].
+    //   Possibly using a CLCropKernel and up to four CLMemsetKernels.
+    // - A tensor is required to hold this initial cropped image.
+    // - A scale function is used to resize the cropped image to the size specified by crop_size.
+    // - A tensor is required to hold the final scaled image before it is copied into the 4D output
+    //   that will hold all final cropped and scaled 3D images using CLCopyKernel.
+    for(unsigned int i = 0; i < _num_boxes; ++i)
+    {
+        auto       crop_tensor = support::cpp14::make_unique<CLTensor>();
+        TensorInfo crop_result_info(1, DataType::F32);
+        crop_result_info.set_data_layout(DataLayout::NHWC);
+        crop_tensor->allocator()->init(crop_result_info);
+        _crop_results.emplace_back(std::move(crop_tensor));
+
+        auto       scale_tensor = support::cpp14::make_unique<CLTensor>();
+        TensorInfo scaled_result_info(out_shape, 1, DataType::F32);
+        scaled_result_info.set_data_layout(DataLayout::NHWC);
+        scale_tensor->allocator()->init(scaled_result_info);
+        _scaled_results.emplace_back(std::move(scale_tensor));
+    }
+}
+
+void CLCropResize::run()
+{
+    ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
+    // The contents of _boxes and _box_ind are required to calculate the shape
+    // of the initial cropped image and thus are required to configure the
+    // kernels used for cropping and scaling.
+    _boxes->map(CLScheduler::get().queue());
+    _box_ind->map(CLScheduler::get().queue());
+    for(unsigned int i = 0; i < _num_boxes; ++i)
+    {
+        // Size of the crop box in _boxes and thus the shape of _crop_results[i]
+        // may not be known until run-time and so the kernels cannot be configured until then.
+        uint32_t    batch_index;
+        Coordinates start{};
+        Coordinates end{};
+        configure_crop(_input, _boxes, _box_ind, _crop_results[i].get(), i, start, end, batch_index);
+
+        auto scale_kernel = support::cpp14::make_unique<CLScale>();
+        scale_kernel->configure(_crop_results[i].get(), _scaled_results[i].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT);
+        _scale.emplace_back(std::move(scale_kernel));
+
+        Window win = calculate_max_window(*_output->info());
+        win.set(3, Window::Dimension(i, i + 1, 1));
+
+        auto copy_kernel = support::cpp14::make_unique<CLCopyKernel>();
+        copy_kernel->configure(_scaled_results[i].get(), _output, PaddingList(), &win);
+        _copy.emplace_back(std::move(copy_kernel));
+
+        _crop_results[i]->allocator()->allocate();
+        _scaled_results[i]->allocator()->allocate();
+
+        run_crop(_input, _crop_results[i].get(), batch_index, start, end, _extrapolation_value);
+    }
+    _boxes->unmap(CLScheduler::get().queue());
+    _box_ind->unmap(CLScheduler::get().queue());
+    CLScheduler::get().sync();
+    for(auto &kernel : _scale)
+    {
+        kernel->run();
+    }
+    CLScheduler::get().sync();
+    for(auto &kernel : _copy)
+    {
+        CLScheduler::get().enqueue(*kernel, true);
+    }
+    CLScheduler::get().sync();
+}
+} // namespace arm_compute
\ No newline at end of file
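
Editor's note: CLCropResize is new in this release. Per validate(), boxes is a 4 x N FP32 tensor of normalised [y0, x0, y1, x1] coordinates, box_ind holds one batch index per box, and the output gathers N cropped-and-resized FP32 images of size crop_size. A hedged usage sketch with illustrative shapes (NHWC, two boxes over a batch of eight 224x224x3 images):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLCropResize.h"

    void crop_resize_example()
    {
        using namespace arm_compute;

        CLTensor input, boxes, box_ind, output;

        TensorInfo in_info(TensorShape(3U, 224U, 224U, 8U), 1, DataType::F32);
        in_info.set_data_layout(DataLayout::NHWC);
        input.allocator()->init(in_info);

        boxes.allocator()->init(TensorInfo(TensorShape(4U, 2U), 1, DataType::F32)); // 2 boxes, [y0, x0, y1, x1] each
        box_ind.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::S32));   // one batch index per box

        TensorInfo out_info(TensorShape(3U, 64U, 64U, 2U), 1, DataType::F32);       // (channels, crop_w, crop_h, num_boxes)
        out_info.set_data_layout(DataLayout::NHWC);
        output.allocator()->init(out_info);

        CLCropResize crop_resize;
        crop_resize.configure(&input, &boxes, &box_ind, &output,
                              Coordinates2D{ 64, 64 },             // crop_size
                              InterpolationPolicy::BILINEAR, 0.f); // extrapolation value for out-of-bounds reads

        // Allocate all four tensors and fill boxes/box_ind on the host before run();
        // as noted in run() above, the per-box crop shapes are only known once those
        // values can be read, so the internal kernels are configured at run time.
        crop_resize.run();
    }
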
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 9da02c1..c6f79d3 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -23,188 +23,117 @@
  */
 #include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CPP/CPPScheduler.h"
 
+#include <cmath>
 #include <memory>
 #include <tuple>
 
 using namespace arm_compute;
 using namespace arm_compute::misc::shape_calculator;
 
-CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _scale_f(),
-      _conv_f(),
-      _flip_weights(),
-      _scaled_output(),
-      _original_weights(nullptr),
-      _weights_flipped(),
-      _is_prepared(false)
+CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_manager(std::move(memory_manager)), _function()
 {
 }
 
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
+                                     unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_UNUSED(inner_border_right, inner_border_top);
+
+    switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info))
+    {
+        case DeconvolutionMethod::DIRECT:
+        {
+            auto f = arm_compute::support::cpp14::make_unique<CLDirectDeconvolutionLayer>();
+            f->configure(input, weights, bias, output, deconv_info, weights_info);
+            _function = std::move(f);
+            break;
+        }
+        case DeconvolutionMethod::GEMM:
+        {
+            auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
+            f->configure(input, weights, bias, output, deconv_info);
+            _function = std::move(f);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
+    }
+}
+
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
                                       unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+    ARM_COMPUTE_UNUSED(inner_border_right, inner_border_top);
+
+    switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
+    {
+        case DeconvolutionMethod::DIRECT:
+        {
+            // Validate direct deconvolution layer
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
+            break;
+        }
+        case DeconvolutionMethod::GEMM:
+        {
+            // Validate gemm-based deconvolution layer
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info));
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
+    }
+
+    return Status{};
+}
+
+DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
+                                                                   const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_UNUSED(output, bias, weights_info);
 
     const DataLayout data_layout = input->data_layout();
 
     const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
-    ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
-
-    const unsigned int stride_x = info.stride().first;
-    const unsigned int stride_y = info.stride().second;
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_right > stride_x - 1, "inner_border_right must be smaller than stride_x");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y");
-
-    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
-                                                    info.pad().first, info.pad().second, stride_x, stride_y);
-
-    const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
-
-    if(bias != nullptr)
+    if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second)
     {
-        if(is_data_type_quantized_asymmetric(input->data_type()))
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        }
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+        return DeconvolutionMethod::DIRECT;
     }
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], "Output's width is invalid.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid.");
-
-    unsigned int        padx            = 0;
-    unsigned int        pady            = 0;
-    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
-    TensorInfo          scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
-    const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
-
-    return Status{};
+    return DeconvolutionMethod::GEMM;
 }
 
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
-                                     unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
-    const unsigned int stride_x = info.stride().first;
-    const unsigned int stride_y = info.stride().second;
-
-    const DataLayout data_layout = input->info()->data_layout();
-
-    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
-    _original_weights = weights;
-    _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
-    _flip_weights.configure(weights, &_weights_flipped);
-
-    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
-                                                    info.pad().first, info.pad().second, stride_x, stride_y);
-
-    const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
-
-    _is_prepared = weights_info.retain_internal_weights();
-
-    _memory_group.manage(&_scaled_output);
-
-    // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
-    unsigned int      padx            = 0;
-    unsigned int      pady            = 0;
-    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
-
-    TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
-    scale_out_info.set_data_layout(data_layout);
-    _scaled_output.allocator()->init(scale_out_info);
-
-    // configure scale function
-    const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
-    _scale_f.configure(input, &_scaled_output, BorderSize(inner_border_top, inner_border_right), upsample_info);
-
-    // setup the function to convolve the upscaled output
-    const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-    _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
-    _scaled_output.allocator()->allocate();
-}
-
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
                                      const WeightsInfo &weights_info)
 {
-    configure(input, weights, bias, output, info, 0, 0, weights_info);
+    configure(input, weights, bias, output, deconv_info, 0, 0, weights_info);
 }
 
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
                                       const WeightsInfo &weights_info)
 {
-    return CLDeconvolutionLayer::validate(input, weights, bias, output, info, 0, 0, weights_info);
+    return CLDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, 0, 0, weights_info);
 }
 
 void CLDeconvolutionLayer::run()
 {
     prepare();
-
-    _memory_group.acquire();
-
-    _scale_f.run();
-    _conv_f.run();
-
-    _memory_group.release();
+    _function->run();
 }
 
 void CLDeconvolutionLayer::prepare()
 {
-    if(!_is_prepared)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        // Run weights flipping and mark original weights tensor as unused
-        _weights_flipped.allocator()->allocate();
-        _weights_flipped.map(true);
-        _original_weights->map(CLScheduler::get().queue(), true);
-        CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
-        _weights_flipped.unmap();
-        _original_weights->unmap(CLScheduler::get().queue());
-        _original_weights->mark_as_unused();
-
-        // Prepare convolution
-        _conv_f.prepare();
-
-        if(!_weights_flipped.is_used())
-        {
-            _weights_flipped.allocator()->free();
-        }
-
-        _is_prepared = true;
-    }
+    _function->prepare();
 }
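
Editor's note: CLDeconvolutionLayer is now a thin dispatcher; get_deconvolution_method() picks GEMM only when the kernel size equals the stride in both dimensions, and DIRECT (the former upsample-then-convolve path, now CLDirectDeconvolutionLayer) otherwise. A small sketch using the signature shown above; pick_method is an illustrative wrapper:

    #include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"

    // Returns DIRECT for e.g. a 4x4 kernel with stride 2x2, and GEMM when the
    // kernel size matches the stride exactly (e.g. 2x2 kernel, stride 2x2).
    arm_compute::DeconvolutionMethod pick_method(const arm_compute::ITensorInfo   *input,
                                                 const arm_compute::ITensorInfo   *weights,
                                                 arm_compute::ITensorInfo         *output,
                                                 const arm_compute::PadStrideInfo &deconv_info)
    {
        return arm_compute::CLDeconvolutionLayer::get_deconvolution_method(
            input, weights, nullptr /* bias */, output, deconv_info, arm_compute::WeightsInfo());
    }
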
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index ce8667d..c66dff0 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,14 +27,11 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-#include <cmath>
-#include <memory>
-#include <tuple>
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT
     : _upsample(),
+      _memset(),
       _output(nullptr)
 {
 }
@@ -51,22 +48,13 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     _output = output;
+    _memset.configure(_output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
     _upsample.configure(input, _output, inner_border, info);
 }
 
 void CLDeconvolutionLayerUpsample::run()
 {
-    _output->map(CLScheduler::get().queue(), true);
-    if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
-    {
-        const uint8_t quantized_zero = _output->info()->quantization_info().offset;
-        std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
-    }
-    else
-    {
-        memset(_output->buffer(), 0, _output->info()->total_size());
-    }
-    _output->unmap(CLScheduler::get().queue());
-
-    CLScheduler::get().enqueue(_upsample, false);
+    CLScheduler::get().enqueue(_memset, false);
+    CLScheduler::get().enqueue(_upsample, true);
 }
+} // namespace arm_compute
\ No newline at end of file
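
For context on the zero-fill change above: in an asymmetric quantized tensor, real 0.0f is represented by the quantization offset rather than the raw byte 0, which is why the removed CPU path wrote quantization_info().offset by hand and the new path hands PixelValue(0, data_type, quantization_info) to the memset kernel. A minimal standalone sketch of that mapping, using a hypothetical quantize_qasymm8() helper rather than the library's QuantizationInfo:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Hypothetical helper mirroring the QASYMM8 rule q = offset + round(x / scale), clamped to [0, 255].
uint8_t quantize_qasymm8(float x, float scale, int offset)
{
    const int q = offset + static_cast<int>(std::lround(x / scale));
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255));
}

int main()
{
    const float scale  = 0.05f;
    const int   offset = 128;
    // Real 0.0f maps to the offset, so a "zero" fill of a QASYMM8 tensor must write 128, not 0.
    std::cout << static_cast<int>(quantize_qasymm8(0.0f, scale, offset)) << std::endl; // prints 128
    return 0;
}
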
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
index e46647a..f687e54 100644
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
@@ -36,8 +36,7 @@
 using namespace arm_compute;
 
 CLDepthConcatenateLayer::CLDepthConcatenateLayer() // NOLINT
-    : _inputs_vector(),
-      _concat_kernels_vector(),
+    : _concat_kernels_vector(),
       _border_handlers_vector(),
       _num_inputs(0)
 {
@@ -53,10 +52,10 @@
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
     }
 
-    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<CLDepthConcatenateLayerKernel[]>(_num_inputs);
-    _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+    _concat_kernels_vector.resize(_num_inputs);
+    _border_handlers_vector.resize(_num_inputs);
 
-    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector_info);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector_info, Window::DimZ);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
@@ -82,7 +81,7 @@
 
     // Output auto initialization if not yet initialized
     TensorInfo  tmp_output_info = *output->clone();
-    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector);
+    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimZ);
     auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
     unsigned int depth_offset = 0;
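
The switch to calculate_concatenate_shape(inputs_vector_info, Window::DimZ) above folds depth concatenation into the generic helper; along DimZ the output keeps width and height and sums the inputs' depths. A small self-contained illustration of that rule in plain C++ (not the library's shape calculator):

#include <array>
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    // Shapes listed as { width, height, depth }; concatenation runs along the depth axis (DimZ).
    const std::vector<std::array<std::size_t, 3>> inputs = { { 16, 16, 3 }, { 16, 16, 5 }, { 16, 16, 8 } };

    std::array<std::size_t, 3> out = { inputs[0][0], inputs[0][1], 0 };
    for(const auto &shape : inputs)
    {
        out[2] += shape[2]; // depths accumulate; width and height must match across inputs
    }
    std::cout << out[0] << "x" << out[1] << "x" << out[2] << std::endl; // 16x16x16
    return 0;
}
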
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 15cbfce..97b0a01 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -45,10 +45,18 @@
 }
 
 void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
-                                               ActivationLayerInfo act_info)
+                                               ActivationLayerInfo act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    // idx_w and idx_h only used for validation
+    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_UNUSED(idx_w);
+    ARM_COMPUTE_UNUSED(idx_h);
+
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
 
     const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
 
@@ -62,11 +70,13 @@
     const ICLTensor *weights_to_use = weights;
     ICLTensor       *output_to_use  = output;
 
-    const bool                      is_stride_1       = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
-    const bool                      is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+    const bool is_stride_1            = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool is_dot8_supported      = dot8_supported(CLKernelLibrary::get().get_device());
+    const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+
     DepthwiseConvolutionReshapeInfo info;
     info.c0        = 4;
-    info.transpose = is_stride_1 && is_dot8_supported;
+    info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
 
     if(_needs_permute)
     {
@@ -103,7 +113,7 @@
 
     // Configure kernel
     _kernel->set_target(CLScheduler::get().target());
-    _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info);
+    _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info, dilation);
 
     // Permute output if needed
     if(_needs_permute)
@@ -126,26 +136,26 @@
 }
 
 Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                unsigned int        depth_multiplier,
-                                                ActivationLayerInfo act_info, GPUTarget gpu_target)
+                                                unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
 
-    const bool                      is_nhwc               = input->data_layout() == DataLayout::NHWC;
-    const bool                      needs_permute         = is_nhwc && (depth_multiplier > 1);
-    const bool                      needs_weights_reshape = is_nhwc && (depth_multiplier == 1);
-    const bool                      is_stride_1           = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
-    const bool                      is_dot8_supported     = dot8_supported(CLKernelLibrary::get().get_device());
+    const bool                      is_nhwc                = input->data_layout() == DataLayout::NHWC;
+    const bool                      needs_permute          = is_nhwc && (depth_multiplier > 1);
+    const bool                      needs_weights_reshape  = is_nhwc && (depth_multiplier == 1);
+    const bool                      is_stride_1            = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool                      is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+    const bool                      is_dot8_supported      = dot8_supported(CLKernelLibrary::get().get_device());
     DepthwiseConvolutionReshapeInfo info;
     info.c0        = 4;
-    info.transpose = is_stride_1 && is_dot8_supported;
+    info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
 
     if(needs_permute)
     {
         TensorShape permuted_input_shape   = input->tensor_shape();
         TensorShape permuted_weights_shape = weights->tensor_shape();
-        TensorShape permuted_output_shape  = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        TensorShape permuted_output_shape  = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
 
         permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
         permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
@@ -155,7 +165,8 @@
         const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
         const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target,
+                                                                                       dilation));
     }
     else if(is_nhwc)
     {
@@ -163,13 +174,13 @@
         {
             auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
             ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
-                                                                                           act_info));
+                                                                                           act_info, dilation));
         }
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation));
     }
 
     return Status{};
@@ -179,7 +190,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_needs_permute)
     {
@@ -192,8 +203,6 @@
     {
         _permute_output_to_nhwc.run();
     }
-
-    _memory_group.release();
 }
 
 void CLDepthwiseConvolutionLayer3x3::prepare()
@@ -229,7 +238,7 @@
 }
 
 void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -238,12 +247,15 @@
     const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
 
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
     const bool can_run_optimised_3x3_kernel = (weights->info()->dimension(idx_w) == 3) && (weights->info()->dimension(idx_h) == 3);
 
     if(bool(can_run_optimised_3x3_kernel))
     {
         auto f = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3>();
-        f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+        f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
         _optimised_function = std::move(f);
     }
     else
@@ -262,7 +274,7 @@
         const GPUTarget gpu_target  = CLScheduler::get().target();
 
         // Calculate output shape
-        TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+        TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
 
         // Output auto initialization if not yet initialized
         auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -283,7 +295,7 @@
         shape_im2col.set(2, weights_z);
         _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
         _im2col_kernel.set_target(gpu_target);
-        _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+        _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);
         CLScheduler::get().tune_kernel_static(_im2col_kernel);
 
         // Weights reshape configuration
@@ -310,7 +322,8 @@
             const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
 
             float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-            int   output_multiplier, output_shift;
+            int   output_multiplier;
+            int   output_shift;
             quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
             _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
             _output_reshaped.allocator()->allocate();
@@ -345,11 +358,14 @@
 }
 
 Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
     const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
 
     if(can_run_optimised_3x3_kernel)
@@ -361,7 +377,7 @@
 
         const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
         const bool         append_bias  = (biases != nullptr) && !is_quantized;
-        const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
         const size_t       weights_w    = weights->dimension(idx_w);
         const size_t       weights_h    = weights->dimension(idx_h);
         const size_t       weights_z    = weights->dimension(idx_c);
@@ -375,7 +391,7 @@
         shape_im2col.set(1, conv_size);
         shape_im2col.set(2, weights_z);
         TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));
 
         const TensorShape shape_weights_reshape(patch_size, weights_z);
         TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
@@ -405,7 +421,7 @@
     }
     else
     {
-        CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+        CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation);
     }
     return Status{};
 }
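
The dilation checks added above compare the dilated kernel extent, k + (k - 1) * (d - 1), against the padded input size; the matching output dimension of a dilated convolution is (in + pads - extent) / stride + 1. A short standalone sketch of both formulas, assuming standard floor rounding rather than calling the library's shape calculator:

#include <iostream>

// Dilated kernel extent as used in the checks above: k + (k - 1) * (d - 1).
unsigned int dilated_extent(unsigned int k, unsigned int d)
{
    return k + (k - 1) * (d - 1);
}

// Output dimension of a dilated convolution with symmetric padding (floor rounding assumed).
unsigned int conv_out_dim(unsigned int in, unsigned int k, unsigned int d, unsigned int pad, unsigned int stride)
{
    return (in + 2 * pad - dilated_extent(k, d)) / stride + 1;
}

int main()
{
    std::cout << dilated_extent(3, 2) << std::endl;         // 5: a 3x3 kernel with dilation 2 covers 5 samples
    std::cout << conv_out_dim(32, 3, 2, 2, 1) << std::endl; // 32: padding 2 keeps the spatial size
    return 0;
}
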
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 6f33b2e..cdfdfc7 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,36 +21,22 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
+#include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
-CLDequantizationLayer::CLDequantizationLayer()
-    : _dequantize_kernel()
+namespace arm_compute
 {
+void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLDequantizationLayerKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
 }
 
-Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(input, output, min_max));
-
-    return Status{};
+    return CLDequantizationLayerKernel::validate(input, output);
 }
-
-void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
-
-    _dequantize_kernel.configure(input, output, min_max);
-}
-
-void CLDequantizationLayer::run()
-{
-    // Run dequantization kernel
-    CLScheduler::get().enqueue(_dequantize_kernel, false);
-}
+} // namespace arm_compute
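
With the min/max tensor gone, the dequantization above is driven purely by the input's QuantizationInfo; for QASYMM8 the rule is real = scale * (quantized - offset). A standalone sketch under that assumption (not the kernel's OpenCL implementation):

#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    // Example QASYMM8 quantization parameters.
    const float scale  = 0.1f;
    const int   offset = 128;

    const std::vector<uint8_t> quantized = { 128, 138, 118 };
    for(uint8_t q : quantized)
    {
        // real = scale * (q - offset): prints 0, 1, -1 for the values above
        std::cout << scale * (static_cast<int>(q) - offset) << std::endl;
    }
    return 0;
}
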
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
new file mode 100644
index 0000000..6e14e26
--- /dev/null
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+using namespace arm_compute::misc::shape_calculator;
+
+CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _scale_f(),
+      _conv_f(),
+      _flip_weights(),
+      _scaled_output(),
+      _original_weights(nullptr),
+      _weights_flipped(),
+      _flip_axis(),
+      _is_prepared(false)
+{
+}
+
+Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+                                            const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+    const DataLayout data_layout = input->data_layout();
+
+    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
+
+    const unsigned int stride_x = info.stride().first;
+    const unsigned int stride_y = info.stride().second;
+
+    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
+                                                    info.pad().first, info.pad().second, stride_x, stride_y);
+
+    const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+    if(bias != nullptr)
+    {
+        if(is_data_type_quantized_asymmetric(input->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], "Output's width is invalid.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid.");
+
+    unsigned int        padx            = 0;
+    unsigned int        pady            = 0;
+    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, 0, 0, out_dims, padx, pady);
+    TensorInfo          scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
+    const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(), info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
+
+    return Status{};
+}
+
+void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+                                           const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+    const unsigned int stride_x = info.stride().first;
+    const unsigned int stride_y = info.stride().second;
+
+    const DataLayout data_layout = input->info()->data_layout();
+
+    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    _original_weights = weights;
+    _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+    _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+    _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
+
+    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
+                                                    info.pad().first, info.pad().second, stride_x, stride_y);
+
+    const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
+
+    _is_prepared = weights_info.retain_internal_weights();
+
+    _memory_group.manage(&_scaled_output);
+
+    // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
+    unsigned int      padx            = 0;
+    unsigned int      pady            = 0;
+    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, 0, 0, out_dims, padx, pady);
+
+    TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+    scale_out_info.set_data_layout(data_layout);
+    _scaled_output.allocator()->init(scale_out_info);
+
+    // Configure scale function
+    const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
+    _scale_f.configure(input, &_scaled_output, BorderSize(), upsample_info);
+
+    // Setup the function to convolve the upscaled output
+    const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+    _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
+    _scaled_output.allocator()->allocate();
+
+    // Setup flip axis data
+    _flip_axis.allocator()->allocate();
+    _flip_axis.map(true);
+    auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+    if(weights->info()->data_layout() == DataLayout::NHWC)
+    {
+        axis_data[0] = 1;
+        axis_data[1] = 2;
+    }
+    else
+    {
+        axis_data[0] = 0;
+        axis_data[1] = 1;
+    }
+    _flip_axis.unmap();
+}
+
+void CLDirectDeconvolutionLayer::run()
+{
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    _scale_f.run();
+    _conv_f.run();
+}
+
+void CLDirectDeconvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights flipping and mark original weights tensor as unused
+        _weights_flipped.allocator()->allocate();
+        _flip_weights.run();
+        _original_weights->mark_as_unused();
+
+        // Prepare convolution
+        _conv_f.prepare();
+
+        // Free flipped weights
+        if(!_weights_flipped.is_used())
+        {
+            _weights_flipped.allocator()->free();
+        }
+
+        _is_prepared = true;
+    }
+}
+} // namespace arm_compute
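
CLDirectDeconvolutionLayer realises deconvolution as zero-stuffed upsampling followed by a stride-1 convolution with flipped weights, sized to hit the standard transposed-convolution relation out = stride * (in - 1) + kernel - 2 * pad. A generic illustration of that relation (an assumption about what deconvolution_output_dimensions returns, not a call into it):

#include <iostream>

// Standard transposed-convolution output size, assuming symmetric padding.
unsigned int deconv_out_dim(unsigned int in, unsigned int kernel, unsigned int pad, unsigned int stride)
{
    return stride * (in - 1) + kernel - 2 * pad;
}

int main()
{
    // A 14x14 input, 3x3 kernel, stride 2, padding 1 gives a 27x27 output.
    std::cout << deconv_out_dim(14, 3, 1, 2) << std::endl; // 27
    return 0;
}
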
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
new file mode 100644
index 0000000..49b5a2a
--- /dev/null
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _run_scale(false)
+{
+}
+
+void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config));
+
+    // Decompose size to radix factors
+    const auto         supported_radix   = CLFFTRadixStageKernel::supported_radix();
+    const unsigned int N                 = input->info()->tensor_shape()[config.axis];
+    const auto         decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+    ARM_COMPUTE_ERROR_ON(decomposed_vector.empty());
+
+    // Flags
+    _run_scale        = config.direction == FFTDirection::Inverse;
+    const bool is_c2r = input->info()->num_channels() == 2 && output->info()->num_channels() == 1;
+
+    // Configure digit reverse
+    FFTDigitReverseKernelInfo digit_reverse_config;
+    digit_reverse_config.axis      = config.axis;
+    digit_reverse_config.conjugate = config.direction == FFTDirection::Inverse;
+    TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
+    _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
+    _memory_group.manage(&_digit_reversed_input);
+    _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+
+    // Create and configure FFT kernels
+    unsigned int Nx = 1;
+    _num_ffts       = decomposed_vector.size();
+    _fft_kernels.resize(_num_ffts);
+    for(unsigned int i = 0; i < _num_ffts; ++i)
+    {
+        const unsigned int radix_for_stage = decomposed_vector.at(i);
+
+        FFTRadixStageKernelInfo fft_kernel_info;
+        fft_kernel_info.axis           = config.axis;
+        fft_kernel_info.radix          = radix_for_stage;
+        fft_kernel_info.Nx             = Nx;
+        fft_kernel_info.is_first_stage = (i == 0);
+        _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+
+        Nx *= radix_for_stage;
+    }
+
+    // Configure scale kernel
+    if(_run_scale)
+    {
+        FFTScaleKernelInfo scale_config;
+        scale_config.scale     = static_cast<float>(N);
+        scale_config.conjugate = config.direction == FFTDirection::Inverse;
+        is_c2r ? _scale_kernel.configure(&_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config);
+    }
+
+    // Allocate tensors
+    _digit_reversed_input.allocator()->allocate();
+    _digit_reverse_indices.allocator()->allocate();
+
+    // Init digit reverse indices
+    const auto digit_reverse_cpu = arm_compute::helpers::fft::digit_reverse_indices(N, decomposed_vector);
+    _digit_reverse_indices.map(CLScheduler::get().queue(), true);
+    std::copy_n(digit_reverse_cpu.data(), N, reinterpret_cast<unsigned int *>(_digit_reverse_indices.buffer()));
+    _digit_reverse_indices.unmap(CLScheduler::get().queue());
+}
+
+Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+
+    // Check if FFT is decomposable
+    const auto         supported_radix   = CLFFTRadixStageKernel::supported_radix();
+    const unsigned int N                 = input->tensor_shape()[config.axis];
+    const auto         decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+    ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+void CLFFT1D::run()
+{
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Run digit reverse
+    CLScheduler::get().enqueue(_digit_reverse_kernel, false);
+
+    // Run radix kernels
+    for(unsigned int i = 0; i < _num_ffts; ++i)
+    {
+        CLScheduler::get().enqueue(_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale);
+    }
+
+    // Run output scaling
+    if(_run_scale)
+    {
+        CLScheduler::get().enqueue(_scale_kernel, true);
+    }
+}
+} // namespace arm_compute
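
CLFFT1D only accepts lengths that decompose_stages can factor into the radix stages the kernels support; an empty decomposition aborts configuration. A rough standalone sketch of such a factorisation, assuming a radix set like { 8, 5, 4, 3, 2 } and a greedy strategy (the library's arm_compute::helpers::fft::decompose_stages may differ):

#include <iostream>
#include <vector>

// Greedy factorisation of N into the given radices; an empty result means "not decomposable".
std::vector<unsigned int> decompose(unsigned int N, const std::vector<unsigned int> &radices)
{
    std::vector<unsigned int> stages;
    for(unsigned int radix : radices)
    {
        while(N % radix == 0)
        {
            stages.push_back(radix);
            N /= radix;
        }
    }
    return (N == 1) ? stages : std::vector<unsigned int>{};
}

int main()
{
    const std::vector<unsigned int> radices = { 8, 5, 4, 3, 2 };
    for(unsigned int n : { 120u, 121u })
    {
        const auto stages = decompose(n, radices);
        std::cout << n << ": ";
        if(stages.empty())
        {
            std::cout << "not decomposable";
        }
        for(unsigned int s : stages)
        {
            std::cout << s << " ";
        }
        std::cout << std::endl; // 120: 8 5 3    121: not decomposable
    }
    return 0;
}
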
diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp
new file mode 100644
index 0000000..165e784
--- /dev/null
+++ b/src/runtime/CL/functions/CLFFT2D.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFFT2D.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+{
+}
+
+void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config));
+
+    // Setup first pass
+    FFT1DInfo first_pass_config;
+    first_pass_config.axis      = config.axes.first;
+    first_pass_config.direction = config.direction;
+    _memory_group.manage(&_first_pass_tensor);
+    _first_pass_func.configure(input, &_first_pass_tensor, first_pass_config);
+
+    // Setup second pass
+    FFT1DInfo second_pass_config;
+    second_pass_config.axis      = config.axes.second;
+    second_pass_config.direction = config.direction;
+    _second_pass_func.configure(&_first_pass_tensor, output, second_pass_config);
+    _first_pass_tensor.allocator()->allocate();
+}
+
+Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT2DInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    // Create intermediate tensor info
+    TensorInfo first_pass_tensor(input->clone()->set_is_resizable(true).reset_padding().set_num_channels(2));
+
+    // Validate first pass
+    FFT1DInfo first_pass_config;
+    first_pass_config.axis      = config.axes.first;
+    first_pass_config.direction = config.direction;
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(input, &first_pass_tensor, first_pass_config));
+
+    // Validate second pass
+    FFT1DInfo second_pass_config;
+    second_pass_config.axis      = config.axes.second;
+    second_pass_config.direction = config.direction;
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config));
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+void CLFFT2D::run()
+{
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    _first_pass_func.run();
+    _second_pass_func.run();
+}
+} // namespace arm_compute
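
CLFFT2D above is just two CLFFT1D passes chained through an intermediate tensor, which works because the 2-D DFT is separable: transform every row, then every column. A tiny naive-DFT illustration of that separability (for exposition only; it bears no resemblance to the radix kernels):

#include <cmath>
#include <complex>
#include <iostream>
#include <vector>

using cd = std::complex<double>;

// Naive 1-D DFT, only to illustrate the separable two-pass scheme used above.
std::vector<cd> dft1d(const std::vector<cd> &x)
{
    const double      pi = std::acos(-1.0);
    const std::size_t n  = x.size();
    std::vector<cd>   X(n);
    for(std::size_t k = 0; k < n; ++k)
    {
        for(std::size_t j = 0; j < n; ++j)
        {
            const double a = -2.0 * pi * static_cast<double>(k * j) / static_cast<double>(n);
            X[k] += x[j] * cd(std::cos(a), std::sin(a));
        }
    }
    return X;
}

int main()
{
    // 2x2 "image": a 2-D DFT is a 1-D DFT over every row, then over every column.
    std::vector<std::vector<cd>> img = { { cd(1), cd(2) }, { cd(3), cd(4) } };

    for(auto &row : img) // first pass
    {
        row = dft1d(row);
    }
    for(std::size_t c = 0; c < 2; ++c) // second pass
    {
        const std::vector<cd> col = dft1d({ img[0][c], img[1][c] });
        img[0][c]                 = col[0];
        img[1][c]                 = col[1];
    }

    for(const auto &row : img)
    {
        for(const auto &v : row)
        {
            std::cout << v << " ";
        }
        std::cout << std::endl; // ~ (10,0) (-2,0) / (-4,0) (0,0) up to rounding
    }
    return 0;
}
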
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
new file mode 100644
index 0000000..afb1cab
--- /dev/null
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+namespace arm_compute
+{
+namespace
+{
+int pad_decomposable(int N)
+{
+    const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
+
+    int  pad           = 0;
+    bool is_decomposed = false;
+    while(!is_decomposed)
+    {
+        const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
+        is_decomposed                = !decomposed_vector.empty();
+        if(!is_decomposed)
+        {
+            ++pad;
+        }
+    }
+    return pad;
+}
+} // namespace
+CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager),
+      _flip_weights_func(),
+      _permute_input_func(),
+      _permute_output_func(),
+      _permute_weights_func(),
+      _permute_bias_func(),
+      _pad_input_func(),
+      _pad_weights_func(),
+      _transform_input_func(memory_manager),
+      _transform_weights_func(),
+      _itransform_output_func(memory_manager),
+      _prod_func(),
+      _reduce_func(),
+      _extract_output_func(),
+      _bias_add_func(),
+      _activation_layer_func(),
+      _permuted_input(),
+      _permuted_weights(),
+      _permuted_bias(),
+      _permuted_output(),
+      _padded_input(),
+      _padded_weights(),
+      _flip_axis(),
+      _flipped_weights(),
+      _transformed_input(),
+      _transformed_weights(),
+      _input_weights_product(),
+      _output_product(),
+      _output_reduced(),
+      _itransformed_output(),
+      _reshaped_output(),
+      _bias_output(),
+      _original_weights(nullptr),
+      _original_bias(nullptr),
+      _is_activationlayer_enabled(false),
+      _needs_permute(false),
+      _has_bias(false),
+      _is_prepared(false)
+{
+}
+
+void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                                      const ActivationLayerInfo &act_info)
+{
+    _original_weights = weights;
+    _original_bias    = biases;
+
+    // Flag if bias addition is required
+    _has_bias = biases != nullptr;
+
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+    const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+    const Size2D pad_valid   = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+                                      pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+    // Tensors to use
+    ICLTensor       *input_to_use   = input;
+    const ICLTensor *weights_to_use = weights;
+    ICLTensor       *output_to_use  = _has_bias ? &_bias_output : output;
+
+    // Permute bias
+    if(biases != nullptr)
+    {
+        _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
+        _permuted_bias.info()->set_data_layout(DataLayout::NCHW);
+    }
+
+    // Permute input if needed
+    _needs_permute = input->info()->data_layout() == DataLayout::NHWC;
+    if(_needs_permute)
+    {
+        _memory_group.manage(&_permuted_input);
+        // Configure the function to transform the input tensor from NHWC -> NCHW
+        _permute_input_func.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+        // Configure the function to transform the weights tensor from HWI -> IHW
+        _permute_weights_func.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+        input_to_use   = &_permuted_input;
+        weights_to_use = &_permuted_weights;
+    }
+
+    // Flip weights
+    _flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding());
+    _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+    _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis);
+
+    // Pad weights
+    const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+    _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w);
+
+    // Transform weights
+    _transform_weights_func = support::cpp14::make_unique<CLFFT2D>();
+    _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo());
+
+    // Pad input
+    const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+    _memory_group.manage(&_padded_input);
+    _pad_input_func.configure(input_to_use, &_padded_input, padding_in);
+    if(_needs_permute)
+    {
+        _permuted_input.allocator()->allocate();
+    }
+
+    // Transform input
+    _memory_group.manage(&_transformed_input);
+    _transform_input_func.configure(&_padded_input, &_transformed_input, FFT2DInfo());
+    _padded_input.allocator()->allocate();
+
+    // Perform product
+    _memory_group.manage(&_output_product);
+    _prod_func.configure(&_transformed_input, &_transformed_weights, &_output_product);
+    _transformed_input.allocator()->allocate();
+
+    // Perform reduction
+    _memory_group.manage(&_output_reduced);
+    _reduce_func.configure(&_output_product, &_output_reduced, 2, ReductionOperation::SUM);
+    _output_product.allocator()->allocate();
+
+    // Transform output
+    _memory_group.manage(&_itransformed_output);
+    FFT2DInfo itransform_info;
+    itransform_info.direction = FFTDirection::Inverse;
+    _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+    _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itransform_info);
+    _output_reduced.allocator()->allocate();
+
+    // Reshape output
+    TensorShape reshaped_shape = _itransformed_output.info()->tensor_shape();
+    reshaped_shape.remove_dimension(2);
+    _reshaped_output.allocator()->init(_itransformed_output.info()->clone()->set_tensor_shape(reshaped_shape));
+
+    // Extract correct region
+    const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
+    const int start_top  = kernel_size.y() - conv_info.pad_top() - 1;
+    const int end_right  = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+    const int end_bottom = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+    if(_has_bias)
+    {
+        _memory_group.manage(&_bias_output);
+    }
+    else if(_needs_permute)
+    {
+        output_to_use = &_permuted_output;
+        _memory_group.manage(&_permuted_output);
+    }
+    _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_bottom));
+    _itransformed_output.allocator()->allocate();
+
+    // Add bias
+    if(biases != nullptr)
+    {
+        output_to_use = output;
+        if(_needs_permute)
+        {
+            output_to_use = &_permuted_output;
+            _memory_group.manage(&_permuted_output);
+        }
+        auto_init_if_empty(*output_to_use->info(), *_bias_output.info());
+        _bias_add_func.configure(&_bias_output, &_permuted_bias, output_to_use, ConvertPolicy::WRAP);
+        _bias_output.allocator()->allocate();
+    }
+
+    // Permute output
+    if(_needs_permute)
+    {
+        // Configure the function to transform the convolved output back to the original NHWC layout
+        _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+        _permute_output_func.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+        // Allocate tensors
+        _permuted_output.allocator()->allocate();
+    }
+
+    // Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activation_layer_func.configure(output, nullptr, act_info);
+    }
+
+    // Setup flip axis data
+    _flip_axis.allocator()->allocate();
+    _flip_axis.map(true);
+    auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+    axis_data[0]   = 0;
+    axis_data[1]   = 1;
+    _flip_axis.unmap();
+}
+
+Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                       const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+
+    // Strides
+    const auto strides = conv_info.stride();
+    ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+
+    // Validate biases
+    if(biases != nullptr)
+    {
+        const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channels] != biases->tensor_shape().x());
+    }
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+
+        // Validate Activation Layer
+        if(act_info.enabled())
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+        }
+    }
+
+    return Status{};
+}
+
+void CLFFTConvolutionLayer::run()
+{
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Transform input
+    if(_needs_permute)
+    {
+        _permute_input_func.run();
+    }
+    _pad_input_func.run();
+    _transform_input_func.run();
+
+    // Perform operations in the frequency domain
+    _prod_func.run();
+    _reduce_func.run();
+
+    // Transform output
+    _itransform_output_func.run();
+    _reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer());
+    _extract_output_func.run();
+    // Add bias
+    if(_has_bias)
+    {
+        _bias_add_func.run();
+    }
+    if(_needs_permute)
+    {
+        _permute_output_func.run();
+    }
+
+    // Run activation layer
+    if(_is_activationlayer_enabled)
+    {
+        _activation_layer_func.run();
+    }
+}
+
+void CLFFTConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Permute bias to NCHW
+        if(_original_bias != nullptr)
+        {
+            _permuted_bias.allocator()->allocate();
+            _permute_bias_func.run();
+            _original_bias->mark_as_unused();
+        }
+
+        const ICLTensor *cur_weights = _original_weights;
+        // Permute weights
+        if(_needs_permute)
+        {
+            ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
+
+            _permuted_weights.allocator()->allocate();
+            _permute_weights_func.run();
+            cur_weights->mark_as_unused();
+            cur_weights = &_permuted_weights;
+        }
+
+        // Flip weights
+        _flipped_weights.allocator()->allocate();
+        _flip_weights_func.run();
+        cur_weights->mark_as_unused();
+
+        // Pad weights
+        _padded_weights.allocator()->allocate();
+        _pad_weights_func.run();
+        _flipped_weights.mark_as_unused();
+        CLScheduler::get().queue().finish();
+        _flipped_weights.allocator()->free();
+
+        // Transform weights to frequency domain
+        _transformed_weights.allocator()->allocate();
+        _transform_weights_func->run();
+        _padded_weights.mark_as_unused();
+        CLScheduler::get().queue().finish();
+        // Delete object and release internal memory
+        _transform_weights_func.reset();
+        _padded_weights.allocator()->free();
+
+        _is_prepared = true;
+    }
+}
+} // namespace arm_compute
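
pad_decomposable() above grows the FFT length (input + kernel - 1 per axis, so the linear convolution fits without circular wrap-around) until it factors into supported radix stages, and reports how much extra padding that takes. A condensed sketch of that loop, with a placeholder power-of-two predicate standing in for the real radix decomposition:

#include <iostream>

// Placeholder predicate for "N factors into supported radix stages"; the real check is
// arm_compute::helpers::fft::decompose_stages. For this sketch only powers of two qualify.
bool is_decomposable(int n)
{
    return n > 0 && (n & (n - 1)) == 0;
}

// Mirrors the pad_decomposable() loop above: how far must N grow to become decomposable?
int pad_decomposable_sketch(int n)
{
    int pad = 0;
    while(!is_decomposable(n + pad))
    {
        ++pad;
    }
    return pad;
}

int main()
{
    // A 28-wide input convolved with a 5-wide kernel needs 28 + 5 - 1 = 32 samples to avoid wrap-around.
    const int full = 28 + 5 - 1;
    std::cout << full << " -> pad " << pad_decomposable_sketch(full) << std::endl; // 0 (already 32)
    std::cout << 30 << " -> pad " << pad_decomposable_sketch(30) << std::endl;     // 2 (grows to 32)
    return 0;
}
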
diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
index d6cda91..fe2a18c 100644
--- a/src/runtime/CL/functions/CLFastCorners.cpp
+++ b/src/runtime/CL/functions/CLFastCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -97,7 +97,7 @@
 {
     cl::CommandQueue q = CLScheduler::get().queue();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_non_max)
     {
@@ -129,6 +129,4 @@
     }
 
     q.flush();
-
-    _memory_group.release();
 }
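
Throughout this patch the explicit acquire()/release() pair is replaced by a MemoryGroupResourceScope, so the memory group is released on every exit path of run(), including early returns and exceptions. The snippet below is a minimal standalone sketch of that RAII idiom using a hypothetical MemoryGroupLike interface; it is not the library's implementation.

#include <iostream>

// Hypothetical minimal interface; the real memory group manages backing memory for tensors.
struct MemoryGroupLike
{
    void acquire() { std::cout << "acquire\n"; }
    void release() { std::cout << "release\n"; }
};

// RAII guard: acquire on construction, release on destruction.
class ScopedMemoryGroup
{
public:
    explicit ScopedMemoryGroup(MemoryGroupLike &group) : _group(group) { _group.acquire(); }
    ~ScopedMemoryGroup() { _group.release(); }
    ScopedMemoryGroup(const ScopedMemoryGroup &) = delete;
    ScopedMemoryGroup &operator=(const ScopedMemoryGroup &) = delete;

private:
    MemoryGroupLike &_group;
};

void run_like(MemoryGroupLike &mg)
{
    ScopedMemoryGroup scope(mg); // same role as: MemoryGroupResourceScope scope_mg(_memory_group);
    // ... enqueue kernels; release happens automatically when 'scope' goes out of scope.
}
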
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 6a2aac6..7b9229c 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -231,7 +231,8 @@
     if(_is_quantized)
     {
         float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
-        int   output_multiplier, output_shift;
+        int   output_multiplier;
+        int   output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
         _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
         _gemmlowp_output.allocator()->allocate();
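
calculate_quantized_multiplier_less_than_one() turns the real-valued rescale factor (input_scale * weights_scale / output_scale, normally in (0, 1)) into an integer multiplier plus a right shift that the output-stage kernel can apply with integer arithmetic. The sketch below shows one way such a decomposition can be computed with std::frexp; it illustrates the idea and is not the library implementation.

#include <cassert>
#include <cmath>
#include <cstdint>

// Find quant_multiplier in [2^30, 2^31) and right_shift >= 0 such that
//   multiplier ~= quant_multiplier * 2^-31 * 2^-right_shift
void quantize_multiplier_lt_one(float multiplier, std::int32_t *quant_multiplier, int *right_shift)
{
    assert(multiplier > 0.f && multiplier < 1.f);
    int         exponent = 0;
    const float fraction = std::frexp(multiplier, &exponent); // multiplier = fraction * 2^exponent, fraction in [0.5, 1)
    std::int64_t q       = std::llround(static_cast<double>(fraction) * (1ll << 31));
    if(q == (1ll << 31)) // fraction rounded up to 1.0: renormalise
    {
        q /= 2;
        ++exponent;
    }
    *quant_multiplier = static_cast<std::int32_t>(q);
    *right_shift      = -exponent;
    // e.g. multiplier = 0.1f -> quant_multiplier ~= 1717986918, right_shift = 3
    //      check: 1717986918 / 2^31 / 2^3 ~= 0.1
}
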
@@ -333,7 +334,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Linearize input if it comes from a convolutional layer
     if(_is_fc_after_conv)
@@ -363,8 +364,6 @@
             CLScheduler::get().enqueue(_accumulate_biases_kernel);
         }
     }
-
-    _memory_group.release();
 }
 
 void CLFullyConnectedLayer::prepare()
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index e91038f..492709f 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -23,7 +23,10 @@
  */
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
 
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h"
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/Helpers.h"
@@ -33,7 +36,6 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 
 namespace arm_compute
@@ -41,46 +43,6 @@
 using namespace arm_compute::misc::shape_calculator;
 using namespace arm_compute::cl_gemm;
 
-namespace
-{
-inline bool is_interleaved_transposed(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
-{
-    bool flag = true;
-
-    if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
-    {
-        if((m > 1) && n < 16)
-        {
-            flag = true;
-        }
-        else
-        {
-            // COMPMID-852
-            if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
-            {
-                constexpr float alpha = 3.2f;
-                constexpr float fact0 = 1.51f;
-                constexpr float fact1 = 1.66f;
-                constexpr float ops   = 12.0f;
-                const float     scale = k > 1024 ? 1.07f : 1.0f;
-                flag                  = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
-            }
-            else
-            {
-                flag = false;
-            }
-        }
-    }
-    else
-    {
-        // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
-        flag = m != 1 && reshape_b_only_on_first_run;
-    }
-
-    return flag;
-}
-} // namespace
-
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
       _mm_kernel(),
@@ -88,57 +50,102 @@
       _reshape_lhs_kernel(),
       _reshape_rhs_kernel(),
       _mm_reshaped_kernel(),
+      _mm_reshaped_only_rhs_kernel(),
       _tmp_a(),
       _tmp_b(),
       _original_b(nullptr),
-      _is_interleaved_transposed(false),
       _run_addition(false),
       _reshape_b_only_on_first_run(false),
       _is_prepared(false),
-      _is_new_gemm_reshaped(false)
+      _gemm_type(GEMMType::NATIVE)
 {
 }
 
-void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+CLGEMM::GEMMType CLGEMM::select_gemm_type(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+    GEMMType gemm_type = GEMMType::RESHAPED_V1;
 
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
+    if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
+    {
+        if((m > 1) && (n < 16))
+        {
+            gemm_type = GEMMType::RESHAPED_V1;
+        }
+        else if((m == 1) && (data_type == DataType::F32))
+        {
+            gemm_type = GEMMType::RESHAPED_ONLY_RHS;
+        }
+        else
+        {
+            // COMPMID-852
+            if((k > 256) && (m > 4) && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+            {
+                constexpr float alpha = 3.2f;
+                constexpr float fact0 = 1.51f;
+                constexpr float fact1 = 1.66f;
+                constexpr float ops   = 12.0f;
+                const float     scale = k > 1024 ? 1.07f : 1.0f;
+                gemm_type             = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
+            }
+            else
+            {
+                gemm_type = GEMMType::NATIVE;
+            }
+        }
 
-    // Check if we need to reshape the matrix B only on the first run
-    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
-    _is_prepared                 = gemm_info.retain_internal_weights();
-    _original_b                  = b;
+        const auto workload = static_cast<float>((m * n) / 20.0f);
 
-    const ICLTensor *matrix_a = a;
-    const ICLTensor *matrix_b = b;
+        gemm_type = ((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && (data_type == DataType::F32)) ? GEMMType::RESHAPED_V2 : gemm_type;
+    }
+    else
+    {
+        // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
+        gemm_type = ((m != 1) && reshape_b_only_on_first_run) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
+    }
 
-    // Get the GPU target
-    const GPUTarget gpu_target = CLScheduler::get().target();
+    return gemm_type;
+}
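
To make the COMPMID-852 inequality and the workload threshold above concrete: with scale = 1.0 (256 < k <= 1024) the reshaped-v1 path is chosen only when 3.2 + 1.51*n/12 < 1.66*n/12, i.e. when n > 256, and the (m*n)/20 > 1600 check then upgrades large FP32 reshaped-v1 cases to the new reshaped kernel. The standalone snippet below mirrors just those two checks for illustration, with two worked cases in the comments; it assumes the Bifrost branch and the k > 256, m > 4, FP32, reshape-B-once preconditions already hold.

#include <cassert>

enum class Choice { NATIVE, RESHAPED_V1, RESHAPED_V2 };

Choice pick(unsigned int m, unsigned int n, unsigned int k)
{
    constexpr float alpha = 3.2f;
    constexpr float fact0 = 1.51f;
    constexpr float fact1 = 1.66f;
    constexpr float ops   = 12.0f;
    const float     scale = k > 1024 ? 1.07f : 1.0f;

    Choice c = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? Choice::RESHAPED_V1 : Choice::NATIVE;

    const float workload = (m * n) / 20.0f;
    if((workload > 1600.0f) && (c == Choice::RESHAPED_V1))
    {
        c = Choice::RESHAPED_V2; // large FP32 workloads move to the new reshaped kernel
    }
    return c;
}

int main()
{
    assert(pick(8, 128, 512) == Choice::NATIVE);       // 3.2 + 16.11 = 19.31 is not < 17.71
    assert(pick(64, 512, 512) == Choice::RESHAPED_V2); // 67.63 < 70.83, workload 1638.4 > 1600
    return 0;
}
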
+
+void CLGEMM::configure_native(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    const unsigned int m          = gemm_info.reinterpret_input_as_3d() ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n          = b->info()->dimension(0);
+    const unsigned int k          = a->info()->dimension(0);
+    const GPUTarget    gpu_target = CLScheduler::get().target();
 
     // Set the target for the kernels
-    _reshape_lhs_kernel.set_target(gpu_target);
     _mm_kernel.set_target(gpu_target);
 
-    // Arguments used by GEMMReshapeInfo
-    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
-    // in order to know how the matrices have been reshaped
-    DataType           data_type                 = a->info()->data_type();
+    GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d());
+
+    // Configure and tune matrix multiply kernel
+    _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision());
+
+    // Tune kernel statically
+    CLScheduler::get().tune_kernel_static(_mm_kernel);
+}
+
+void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
     bool               reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
     const unsigned int m                         = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
     const unsigned int n                         = b->info()->dimension(0);
     const unsigned int k                         = a->info()->dimension(0);
-    const unsigned int batch_size                = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
     const int          depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target                = CLScheduler::get().target();
     int                mult_transpose1xW_width   = 1;
     int                mult_interleave4x4_height = 1;
 
+    // Set the target for the kernels
+    _reshape_lhs_kernel.set_target(gpu_target);
+    _mm_kernel.set_target(gpu_target);
+
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
         mult_transpose1xW_width   = 4;
         mult_interleave4x4_height = 2;
     }
+
     GEMMRHSMatrixInfo rhs_info;
     rhs_info.n0         = 16 / b->info()->element_size();
     rhs_info.k0         = 1;
@@ -153,112 +160,183 @@
     lhs_info.interleave = true;
     lhs_info.transpose  = true;
 
-    // Check if we need to reshape the matrix A and matrix B
-    _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
+    GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
 
-    // Check if we can run the new reshaped GEMM
-    const auto workload   = static_cast<float>((m * n) / 20.0f);
-    _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32);
-
-    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
-    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
-    const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped;
-
-    // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
-    if(_is_interleaved_transposed)
+    _memory_group.manage(&_tmp_a);
+    if(!_reshape_b_only_on_first_run)
     {
-        reinterpret_input_as_3d = false;
-
-        matrix_a = &_tmp_a;
-        matrix_b = &_tmp_b;
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_tmp_a);
-        if(!_reshape_b_only_on_first_run)
-        {
-            _memory_group.manage(&_tmp_b);
-        }
-        // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
-
-        if(_is_new_gemm_reshaped)
-        {
-            GEMMLHSMatrixInfo lhs_info;
-
-            // Pick up the GEMM configuration
-            std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
-
-            _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
-            _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
-
-            // Configure and tune matrix multiply kernel
-            _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
-                                                                                                                 depth_output_gemm3d, reinterpret_input_as_3d));
-        }
-        else
-        {
-            // Configure interleave kernel
-            _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
-            // Configure transpose kernel
-            _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
-        }
+        _memory_group.manage(&_tmp_b);
     }
 
-    if(!_is_new_gemm_reshaped)
-    {
-        // Configure and tune matrix multiply kernel
-        _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed,
-                             GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d),
-                             gemm_info.fp_mixed_precision());
-        CLScheduler::get().tune_kernel_static(_mm_kernel);
-    }
+    // Configure interleave kernel
+    _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
 
-    if(_is_interleaved_transposed)
-    {
-        // Allocate intermediate tensors
-        _tmp_a.allocator()->allocate();
-        if(!_reshape_b_only_on_first_run)
-        {
-            _tmp_b.allocator()->allocate();
-        }
-    }
+    // Configure transpose kernel
+    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
 
-    // Configure matrix addition kernel
-    if(add_matrix_c && !use_fused_add)
+    // Configure and tune matrix multiply kernel
+    _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision());
+
+    CLScheduler::get().tune_kernel_static(_mm_kernel);
+
+    // Allocate intermediate tensors
+    _tmp_a.allocator()->allocate();
+    if(!_reshape_b_only_on_first_run)
     {
-        _ma_kernel.configure(c, output, beta);
-        _run_addition = true;
+        _tmp_b.allocator()->allocate();
     }
 }
 
-Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure_reshaped_v2(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON(c != nullptr);
+    ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_UNUSED(c);
+
+    DataType           data_type               = a->info()->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                       = b->info()->dimension(0);
+    const unsigned int k                       = a->info()->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+
+    // Set the target for the kernels
+    _reshape_lhs_kernel.set_target(gpu_target);
+    _mm_kernel.set_target(gpu_target);
+
+    GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, false);
+
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp_a);
+    if(!_reshape_b_only_on_first_run)
+    {
+        _memory_group.manage(&_tmp_b);
+    }
+    // _tmp_a and _tmp_b will be auto-configured by _reshape_lhs_kernel and _reshape_rhs_kernel
+
+    GEMMLHSMatrixInfo lhs_info{};
+    GEMMRHSMatrixInfo rhs_info{};
+
+    // Pick up the GEMM configuration
+    std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
+
+    // Configure lhs_info and rhs_info
+    std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+    _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+    // Configure and tune matrix multiply kernel
+    _mm_reshaped_kernel.configure(&_tmp_a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info);
+
+    // Allocate intermediate tensors
+    _tmp_a.allocator()->allocate();
+    if(!_reshape_b_only_on_first_run)
+    {
+        _tmp_b.allocator()->allocate();
+    }
+}
+
+void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON(c != nullptr);
+    ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_UNUSED(c);
+
+    DataType           data_type               = a->info()->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                       = b->info()->dimension(0);
+    const unsigned int k                       = a->info()->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+
+    // Set the target for the kernels
+    _mm_kernel.set_target(gpu_target);
+
+    GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+    // Manage intermediate buffers
+    if(!_reshape_b_only_on_first_run)
+    {
+        _memory_group.manage(&_tmp_b);
+    }
+
+    GEMMLHSMatrixInfo lhs_info{};
+    GEMMRHSMatrixInfo rhs_info{};
+
+    // Pick up the GEMM configuration
+    std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
+
+    // Configure lhs_info and rhs_info
+    std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+    // Configure and tune matrix multiply kernel
+    _mm_reshaped_only_rhs_kernel.configure(a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info);
+
+    if(!_reshape_b_only_on_first_run)
+    {
+        _tmp_b.allocator()->allocate();
+    }
+}
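
Both reshaped paths obtain their block sizes from a per-GPU-target configuration object created by a factory (ICLGEMMKernelConfiguration above) and consumed through std::tie. The stripped-down sketch below shows that pattern with hypothetical LhsCfg/RhsCfg/IKernelConfig/make_config names; the real configurations carry the tiling parameters (e.g. n0, k0) seen in the hunks above.

#include <memory>
#include <tuple>
#include <utility>

// Hypothetical stand-ins for the LHS/RHS matrix-info structures.
struct LhsCfg { unsigned int m0 = 4, k0 = 4; };
struct RhsCfg { unsigned int n0 = 4, k0 = 4, h0 = 4; };

// One configure() call per GEMM shape, in the spirit of the kernel-configuration interface.
struct IKernelConfig
{
    virtual ~IKernelConfig() = default;
    virtual std::pair<LhsCfg, RhsCfg> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b) = 0;
};

struct DefaultConfig : IKernelConfig
{
    std::pair<LhsCfg, RhsCfg> configure(unsigned int m, unsigned int, unsigned int, unsigned int) override
    {
        LhsCfg lhs;
        RhsCfg rhs;
        lhs.m0 = (m > 4) ? 8 : 4; // purely illustrative tile choice
        return { lhs, rhs };
    }
};

// Factory keyed on the GPU target (a single stub target here).
std::unique_ptr<IKernelConfig> make_config(int /*gpu_target*/)
{
    return std::make_unique<DefaultConfig>();
}

void use(unsigned int m, unsigned int n, unsigned int k, unsigned int batch)
{
    LhsCfg lhs;
    RhsCfg rhs;
    auto cfg = make_config(/*gpu_target=*/0);
    std::tie(lhs, rhs) = cfg->configure(m, n, k, batch); // same consumption pattern as above
}
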
+
+Status CLGEMM::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
     ARM_COMPUTE_UNUSED(output);
 
-    // Check if we need to reshape the matrix B only on the first run
-    const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const bool         add_c                   = (beta != 0.f && c != nullptr);
+    const bool         is_beta_one             = std::abs(1.0f - beta) < 0.00001f;
+    const bool         fuse_add                = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
 
-    const ITensorInfo *matrix_a_info = a;
-    const ITensorInfo *matrix_b_info = b;
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+    // Validate matrix multiply
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
+                                                                     false, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+
+    if(add_c && !fuse_add)
+    {
+        // Validate matrix addition kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+    }
+
+    return Status{};
+}
+
+Status CLGEMM::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(output);
 
     TensorInfo tmp_a_info{};
     TensorInfo tmp_b_info{};
 
     // Get the GPU target
-    const GPUTarget gpu_target = CLScheduler::get().target();
-
-    // Arguments used by GEMMReshapeInfo
-    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
-    // in order to know how the matrices have been reshaped
-    DataType           data_type                 = a->data_type();
-    bool               reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const GPUTarget    gpu_target                = CLScheduler::get().target();
+    const unsigned int m                         = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
     const unsigned int n                         = b->dimension(0);
     const unsigned int k                         = a->dimension(0);
-    const unsigned int batch_size                = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
     int                mult_transpose1xW_width   = 1;
     int                mult_interleave4x4_height = 1;
     const int          depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    const bool         add_c                     = (beta != 0.f && c != nullptr);
+    const bool         is_beta_one               = std::abs(1.0f - beta) < 0.00001f;
+    const bool         fuse_add                  = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
 
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
@@ -280,66 +358,21 @@
     lhs_info.interleave = true;
     lhs_info.transpose  = true;
 
-    // Check if we need to reshape the matrix A and matrix B
-    const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
 
-    // Check if we can run the new reshaped GEMM
-    const auto workload             = static_cast<float>((m * n) / 20.0f);
-    const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32);
+    // Validate interleave kernel
+    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
 
-    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
-    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
-    const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped;
+    // Validate transpose kernel
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
 
-    // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
-    if(run_interleave_transpose)
-    {
-        reinterpret_input_as_3d = false;
-    }
+    // Validate matrix multiply
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
+                                                                     true, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
 
-    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d);
-
-    if(run_interleave_transpose)
-    {
-        matrix_a_info = &tmp_a_info;
-        matrix_b_info = &tmp_b_info;
-
-        if(is_new_gemm_reshaped)
-        {
-            GEMMLHSMatrixInfo lhs_info;
-
-            // Pick up the GEMM configuration
-            std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
-
-            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-
-            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
-
-            // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
-                                                                                     depth_output_gemm3d, reinterpret_input_as_3d)));
-        }
-        else
-        {
-            // Validate interleave kernel
-            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-            // Validate transpose kernel
-            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
-        }
-    }
-
-    if(!is_new_gemm_reshaped)
-    {
-        // Validate matrix multiply
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta,
-                                                                         run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
-    }
-
-    if(add_matrix_c && !use_fused_add)
+    if(add_c && !fuse_add)
     {
         // Validate matrix addition kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
@@ -348,32 +381,263 @@
     return Status{};
 }
 
+Status CLGEMM::validate_reshaped_v2(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(output);
+
+    TensorInfo tmp_a_info{};
+    TensorInfo tmp_b_info{};
+
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    DataType           data_type               = a->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const bool         add_c                   = (beta != 0.f && c != nullptr);
+
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, false);
+
+    GEMMLHSMatrixInfo lhs_info;
+    GEMMRHSMatrixInfo rhs_info;
+
+    // Pick up the GEMM configuration
+    std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get());
+
+    // Configure lhs_info and rhs_info
+    std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+    // Validate matrix multiply
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info));
+
+    if(add_c)
+    {
+        // Validate matrix addition kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+    }
+
+    return Status{};
+}
+
+Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(output);
+
+    TensorInfo tmp_b_info{};
+
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    const DataType     data_type               = a->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const bool         add_c                   = (beta != 0.f && c != nullptr);
+
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+    GEMMLHSMatrixInfo lhs_info;
+    GEMMRHSMatrixInfo rhs_info;
+
+    // Pick up the GEMM configuration
+    std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get());
+
+    // Configure lhs_info and rhs_info
+    std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+    // Validate matrix multiply
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info));
+
+    if(add_c)
+    {
+        // Validate matrix addition kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+    }
+
+    return Status{};
+}
+
+void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
+
+    // Check if we need to reshape the matrix B only on the first run
+    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _is_prepared                 = gemm_info.retain_internal_weights();
+    _original_b                  = b;
+
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                       = b->info()->dimension(0);
+    const unsigned int k                       = a->info()->dimension(0);
+
+    // Select GEMMType
+    _gemm_type = select_gemm_type(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
+
+    const bool is_gemm_v2  = (_gemm_type == GEMMType::RESHAPED_V2) || (_gemm_type == GEMMType::RESHAPED_ONLY_RHS);
+    const bool add_c       = (beta != 0.f && c != nullptr);
+    const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+    const bool fuse_add    = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !is_gemm_v2;
+
+    switch(_gemm_type)
+    {
+        case GEMMType::NATIVE:
+        {
+            configure_native(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            break;
+        }
+        case GEMMType::RESHAPED_V1:
+        {
+            configure_reshaped_v1(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            break;
+        }
+        case GEMMType::RESHAPED_V2:
+        {
+            configure_reshaped_v2(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            break;
+        }
+        case GEMMType::RESHAPED_ONLY_RHS:
+        {
+            configure_reshaped_only_rhs(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("GEMMType not supported");
+        }
+    }
+
+    // Configure matrix addition kernel
+    if(add_c && !fuse_add)
+    {
+        _ma_kernel.configure(c, output, beta);
+        _run_addition = true;
+    }
+}
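
In configure() above the C matrix is fused into the matrix-multiply kernel only when beta is numerically one, C is a 1-D vector and neither of the v2/only-RHS kernels is selected; every other non-zero beta*C goes through a separate CLGEMMMatrixAdditionKernel. The small check below mirrors that decision; bias_path() and the c_dims encoding (0 meaning "no C tensor") are illustrative names, and the is_gemm_v2 flag is assumed to come from the type selection above.

#include <cassert>
#include <cmath>

enum class BiasPath { NONE, FUSED_IN_MM_KERNEL, SEPARATE_ADDITION_KERNEL };

BiasPath bias_path(float beta, unsigned int c_dims, bool is_gemm_v2)
{
    const bool has_c       = (c_dims != 0);
    const bool add_c       = (beta != 0.f) && has_c;
    const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
    const bool fuse_add    = is_beta_one && has_c && (c_dims == 1) && !is_gemm_v2;

    if(!add_c)
    {
        return BiasPath::NONE;
    }
    return fuse_add ? BiasPath::FUSED_IN_MM_KERNEL : BiasPath::SEPARATE_ADDITION_KERNEL;
}

int main()
{
    assert(bias_path(1.0f, 1, false) == BiasPath::FUSED_IN_MM_KERNEL);       // 1-D bias, beta == 1
    assert(bias_path(1.0f, 1, true) == BiasPath::SEPARATE_ADDITION_KERNEL);  // v2 kernels take no fused C
    assert(bias_path(0.5f, 2, false) == BiasPath::SEPARATE_ADDITION_KERNEL); // general beta*C
    assert(bias_path(0.0f, 2, false) == BiasPath::NONE);
    return 0;
}
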
+
+Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+
+    // Select GEMMType
+    GEMMType gemm_type = select_gemm_type(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run(), gpu_target);
+
+    switch(gemm_type)
+    {
+        case GEMMType::NATIVE:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c, output, alpha, beta, gemm_info));
+            break;
+        }
+        case GEMMType::RESHAPED_V1:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c, output, alpha, beta, gemm_info));
+            break;
+        }
+        case GEMMType::RESHAPED_V2:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v2(a, b, c, output, alpha, beta, gemm_info));
+            break;
+        }
+        case GEMMType::RESHAPED_ONLY_RHS:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c, output, alpha, beta, gemm_info));
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
+        }
+    }
+
+    return Status{};
+}
+
 void CLGEMM::run()
 {
     prepare();
 
-    _memory_group.acquire();
-
-    if(_is_interleaved_transposed)
-    {
-        // Run interleave kernel
-        CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
-
-        if(!_reshape_b_only_on_first_run)
-        {
-            // Run transpose kernel
-            CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
-        }
-    }
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run matrix multiply kernel
-    if(_is_new_gemm_reshaped)
+    switch(_gemm_type)
     {
-        CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
-    }
-    else
-    {
-        CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+        case GEMMType::NATIVE:
+        {
+            CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+            break;
+        }
+        case GEMMType::RESHAPED_V1:
+        {
+            // Run interleave kernel
+            CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
+
+            if(!_reshape_b_only_on_first_run)
+            {
+                // Run transpose kernel
+                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+            }
+
+            CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+            break;
+        }
+        case GEMMType::RESHAPED_V2:
+        {
+            // Run interleave kernel
+            CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
+
+            if(!_reshape_b_only_on_first_run)
+            {
+                // Run transpose kernel
+                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+            }
+
+            CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
+            break;
+        }
+        case GEMMType::RESHAPED_ONLY_RHS:
+        {
+            if(!_reshape_b_only_on_first_run)
+            {
+                // Run transpose kernel
+                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+            }
+
+            CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, !_run_addition);
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("GEMMType not supported");
+        }
     }
 
     // Run matrix addition kernel
@@ -381,15 +645,13 @@
     {
         CLScheduler::get().enqueue(_ma_kernel);
     }
-
-    _memory_group.release();
 }
 
 void CLGEMM::prepare()
 {
     if(!_is_prepared)
     {
-        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        if(_gemm_type != GEMMType::NATIVE && _reshape_b_only_on_first_run)
         {
             // Run transpose kernel and mark original weights tensor as unused
             _tmp_b.allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 7105e85..03d516f 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -262,7 +262,7 @@
         shape_gemm.set(0, mat_weights_cols);
         shape_gemm.set(1, conv_w * conv_h);
 
-        // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
+        // TODO(COMPMID-2078): input->clone() doesn't work with subtensors for grouped convolutions.
         TensorInfo info_gemm(shape_gemm, 1, data_type);
         info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
         _gemm_output.allocator()->init(info_gemm);
@@ -372,7 +372,9 @@
     const unsigned int kernel_width  = weights->dimension(idx_width);
     const unsigned int kernel_height = weights->dimension(idx_height);
 
-    TensorInfo         im2col_reshaped_info, info_gemm, weights_reshaped_info;
+    TensorInfo         im2col_reshaped_info{};
+    TensorInfo         info_gemm{};
+    TensorInfo         weights_reshaped_info{};
     const ITensorInfo *gemm_input_to_use  = input;
     const ITensorInfo *gemm_output_to_use = output;
     const ITensorInfo *weights_to_use     = weights;
@@ -526,7 +528,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run im2col
     if(!_skip_im2col)
@@ -562,8 +564,6 @@
     {
         _activationlayer_function.run();
     }
-
-    _memory_group.release();
 }
 
 void CLGEMMConvolutionLayer::prepare()
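
For context on the shape_gemm lines retained above: the GEMM that implements the convolution multiplies the reshaped weights by the im2col output, so its result has mat_weights_cols (the output feature maps, ignoring grouping here) in dimension 0 and conv_w * conv_h output positions in dimension 1. The worked sizing below uses hypothetical layer dimensions and illustrative names (GemmDims, im2col_gemm_dims); it is not library code.

#include <cassert>

// K is the im2col patch length, N the output channels, M the number of output positions.
struct GemmDims { unsigned int M, N, K; };

GemmDims im2col_gemm_dims(unsigned int kernel_w, unsigned int kernel_h, unsigned int in_ch,
                          unsigned int out_ch, unsigned int conv_w, unsigned int conv_h)
{
    return { conv_w * conv_h,               // one row per output position
             out_ch,                        // mat_weights_cols in the code above
             kernel_w * kernel_h * in_ch }; // values per im2col patch
}

int main()
{
    const GemmDims d = im2col_gemm_dims(3, 3, 64, 128, 56, 56);
    assert(d.M == 3136); // 56 * 56 output positions
    assert(d.N == 128);  // output feature maps
    assert(d.K == 576);  // 3 * 3 * 64 values per patch
    return 0;
}
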
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
new file mode 100644
index 0000000..bcb91e0
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "utils/TypePrinter.h"
+
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+namespace
+{
+std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw)
+{
+    Coordinates start;
+    Coordinates end;
+
+    if(is_nchw)
+    {
+        start.set(0, deconv_info.pad_left());
+        start.set(1, deconv_info.pad_top());
+        end.set(0, output_info.dimension(0) - deconv_info.pad_right());
+        end.set(1, output_info.dimension(1) - deconv_info.pad_bottom());
+    }
+    else
+    {
+        start.set(0, 0);
+        start.set(1, deconv_info.pad_left());
+        start.set(2, deconv_info.pad_top());
+
+        end.set(0, output_info.dimension(0));
+        end.set(1, output_info.dimension(1) - deconv_info.pad_right());
+        end.set(2, output_info.dimension(2) - deconv_info.pad_bottom());
+    }
+
+    return { start, end };
+}
+} // namespace
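
compute_start_end_slice_coordinates() above produces the crop window that removes the deconvolution padding from the reshaped GEMM output; in NHWC the channel dimension (index 0) is kept whole and the spatial padding moves to indices 1 and 2. For example, with all four pads equal to 1 on a 16x16 NCHW output the window is start (1, 1) / end (15, 15), while in NHWC with 32 channels it is start (0, 1, 1) / end (32, 15, 15). The tiny check below mirrors the helper with a hypothetical Coord/Window/crop_window naming.

#include <array>
#include <cassert>

using Coord = std::array<int, 3>;

struct Window { Coord start, end; };

// Mirrors compute_start_end_slice_coordinates() above for the two layouts.
Window crop_window(int dim0, int dim1, int dim2, int pad_l, int pad_t, int pad_r, int pad_b, bool is_nchw)
{
    if(is_nchw)
    {
        return { { pad_l, pad_t, 0 }, { dim0 - pad_r, dim1 - pad_b, 0 } };
    }
    return { { 0, pad_l, pad_t }, { dim0, dim1 - pad_r, dim2 - pad_b } };
}

int main()
{
    const Window nchw = crop_window(16, 16, 0, 1, 1, 1, 1, true);
    assert(nchw.start[0] == 1 && nchw.start[1] == 1);
    assert(nchw.end[0] == 15 && nchw.end[1] == 15);

    const Window nhwc = crop_window(32, 16, 16, 1, 1, 1, 1, false); // dim0 = channels in NHWC
    assert(nhwc.start[0] == 0 && nhwc.start[1] == 1 && nhwc.start[2] == 1);
    assert(nhwc.end[0] == 32 && nhwc.end[1] == 15 && nhwc.end[2] == 15);
    return 0;
}
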
+
+CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _mm_gemm(),
+      _mm_gemmlowp(),
+      _gemmlowp_output_stage(),
+      _permute_input_to_nhwc(),
+      _permute_weights_to_nhwc(),
+      _reshape_weights(),
+      _transpose_weights(),
+      _deconv_reshape(),
+      _slice_gemm(),
+      _gemmlowp_final(),
+      _reshaped_weights(),
+      _reshaped_weights_t(),
+      _permuted_input(),
+      _permuted_weights(),
+      _gemm_output(),
+      _slice_gemm_input(),
+      _original_weights(),
+      _is_prepared(false),
+      _padded_input(false),
+      _is_nchw(false),
+      _is_quantized(false)
+{
+}
+
+Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+    DataLayout data_layout  = input->data_layout();
+    const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
+    const bool is_nchw      = input->data_layout() == DataLayout::NCHW;
+    const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+
+    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_b = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != deconv_info.stride().first);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) != deconv_info.stride().second);
+
+    TensorShape nhwc_weights_shape = weights->tensor_shape();
+    TensorShape nhwc_input_shape   = input->tensor_shape();
+
+    if(is_nchw)
+    {
+        permute(nhwc_weights_shape, PermutationVector(2, 0, 1));
+        permute(nhwc_input_shape, PermutationVector(2, 0, 1));
+
+        TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW);
+
+        TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW);
+
+        CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1));
+        CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1));
+    }
+
+    const TensorShape reshaped_shape = TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]);
+    const TensorInfo  reshaped_info  = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info));
+
+    TensorShape      transposed_shape(reshaped_shape[1], reshaped_shape[0]);
+    const TensorInfo reshaped_t_info = reshaped_info.clone()->set_is_resizable(true).set_tensor_shape(transposed_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info));
+
+    TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b),
+                                  input->dimension(idx_w),
+                                  input->dimension(idx_h),
+                                  input->dimension(idx_b));
+
+    TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true);
+    GEMMInfo   gemm_info(false, false, true, input->dimension(idx_h), true);
+
+    if(is_quantized)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32),
+                                                                           gemm_info));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
+    }
+
+    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
+                                                    0, 0, deconv_info.stride().first, deconv_info.stride().second);
+    const TensorShape deconv_shape       = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+    TensorInfo        col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
+
+    if(padded_input && is_quantized)
+    {
+        const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&col2im_output_info, nullptr,
+                                                                                                  &col2im_output_info.clone()->set_is_resizable(true).set_data_type(DataType::QASYMM8)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(DataType::QASYMM8), output, start_end.first, start_end.second));
+    }
+    else if(padded_input)
+    {
+        const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second));
+    }
+    else if(is_quantized)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&col2im_output_info, nullptr, output));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info));
+    }
+
+    return Status{};
+}
+
+void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(),
+                                                                  weights->info(),
+                                                                  bias != nullptr ? bias->info() : nullptr,
+                                                                  output->info(),
+                                                                  deconv_info));
+
+    _original_weights = weights;
+    _padded_input     = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
+    _is_nchw          = input->info()->data_layout() == DataLayout::NCHW;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+
+    const ICLTensor *input_to_use   = input;
+    const ICLTensor *weights_to_use = weights;
+
+    // If the data layout is NCHW, transform everything to NHWC. An alternative would be to do an
+    // outer product in NCHW and then accumulate through a reduction. That has two drawbacks:
+    // first, the outer product is less efficient than a full GEMM; second, the reduction might be
+    // slower than the GEMM itself.
+    if(_is_nchw)
+    {
+        _memory_group.manage(&_permuted_input);
+        _permute_input_to_nhwc.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+
+        _permute_weights_to_nhwc.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+
+        input_to_use   = &_permuted_input;
+        weights_to_use = &_permuted_weights;
+    }
+
+    // Reshape the input weights. The weights will be reshaped only once during the call to prepare()
+    _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0),
+                                                               weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)),
+                                                   1,
+                                                   input->info()->data_type(), weights->info()->quantization_info()));
+
+    _reshape_weights.configure(weights_to_use, &_reshaped_weights);
+    _transpose_weights.configure(&_reshaped_weights, &_reshaped_weights_t);
+
+    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    GEMMInfo     gemm_info(false, false, true, input->info()->dimension(idx_h), true);
+
+    // Configure the matrix multiply: GEMMLowp for asymmetric quantized types, plain GEMM otherwise
+    if(_is_quantized)
+    {
+        _mm_gemmlowp.configure(input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info);
+    }
+    else
+    {
+        _mm_gemm.configure(input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info);
+    }
+
+    if(_is_nchw)
+    {
+        _permuted_input.allocator()->allocate();
+    }
+
+    ICLTensor *deconv_reshape_output = nullptr;
+    ICLTensor *slice_output          = nullptr;
+    ICLTensor *output_stage_output   = nullptr;
+
+    if(_padded_input && _is_quantized)
+    {
+        _memory_group.manage(&_slice_gemm_input);
+        _memory_group.manage(&_gemmlowp_final);
+        deconv_reshape_output = &_gemmlowp_final;
+        output_stage_output   = &_slice_gemm_input;
+        slice_output          = output;
+    }
+    else if(_padded_input)
+    {
+        _memory_group.manage(&_slice_gemm_input);
+        deconv_reshape_output = &_slice_gemm_input;
+        slice_output          = output;
+    }
+    else if(_is_quantized)
+    {
+        _memory_group.manage(&_gemmlowp_final);
+        deconv_reshape_output = &_gemmlowp_final;
+        output_stage_output   = output;
+    }
+    else
+    {
+        deconv_reshape_output = output;
+    }
+
+    // Configure a Col2Im call to reshape the output of GEMM
+    _deconv_reshape.configure(&_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
+    _gemm_output.allocator()->allocate();
+
+    if(_is_quantized)
+    {
+        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / _gemmlowp_final.info()->quantization_info().scale;
+        int   output_multiplier(0);
+        int   output_shift(0);
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        _gemmlowp_output_stage.configure(&_gemmlowp_final, nullptr, output_stage_output, output_multiplier, output_shift, _gemmlowp_final.info()->quantization_info().offset);
+        _gemmlowp_final.allocator()->allocate();
+    }
+
+    // If the input was padded, the output needs to be sliced.
+    if(_padded_input)
+    {
+        const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
+        _slice_gemm.configure(&_slice_gemm_input, slice_output, start_end.first, start_end.second);
+        _slice_gemm_input.allocator()->allocate();
+    }
+}
+
+void CLGEMMDeconvolutionLayer::run()
+{
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    if(_is_nchw)
+    {
+        _permute_input_to_nhwc.run();
+    }
+
+    if(_is_quantized)
+    {
+        _mm_gemmlowp.run();
+    }
+    else
+    {
+        _mm_gemm.run();
+    }
+
+    CLScheduler::get().enqueue(_deconv_reshape, false);
+
+    if(_is_quantized)
+    {
+        _gemmlowp_output_stage.run();
+    }
+
+    if(_padded_input)
+    {
+        _slice_gemm.run();
+    }
+}
+
+void CLGEMMDeconvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        if(_is_nchw)
+        {
+            _permuted_weights.allocator()->allocate();
+            _permute_weights_to_nhwc.run();
+        }
+
+        _reshaped_weights.allocator()->allocate();
+        _reshape_weights.run();
+
+        if(_is_nchw)
+        {
+            _permuted_weights.allocator()->free();
+        }
+
+        _reshaped_weights_t.allocator()->allocate();
+        _transpose_weights.run();
+
+        // Prepare gemm
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+        }
+        else
+        {
+            _mm_gemmlowp.prepare();
+        }
+
+        // Free resources
+        if(!_reshaped_weights_t.is_used())
+        {
+            _reshaped_weights_t.allocator()->free();
+        }
+
+        _original_weights->mark_as_unused();
+        _is_prepared = true;
+    }
+}
+} // namespace arm_compute
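
A minimal sketch of the prepare-once idiom used by run()/prepare() above: heavy weight transforms run a single time on the first run(), their temporaries are freed, and the original weights are marked unused. ToyFunction and the printed steps are made-up names that only illustrate the control flow, not the actual kernels.

#include <iostream>

class ToyFunction
{
public:
    void run()
    {
        prepare();
        std::cout << "run with transformed weights\n";
    }

    void prepare()
    {
        if(!_is_prepared)
        {
            std::cout << "transform weights once\n";           // e.g. permute + reshape + transpose
            std::cout << "free intermediate weight buffers\n"; // temporaries are no longer needed
            _is_prepared = true;
        }
    }

private:
    bool _is_prepared{ false };
};

int main()
{
    ToyFunction f;
    f.run(); // first call pays the weight-transformation cost
    f.run(); // subsequent calls skip it
    return 0;
}
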
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 2a01db7..049db1d 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -31,7 +32,6 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
 
 namespace arm_compute
 {
@@ -40,17 +40,16 @@
 
 namespace
 {
-inline bool is_gemm_reshaped(unsigned int m, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target)
 {
-    return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (m > 1) && (reshape_b_only_on_first_run);
+    return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run);
 }
 } // namespace
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
       _mm_kernel(),
-      _mm_reshaped_kernel(),
-      _mtx_a_reshape_kernel(),
+      _mm_reshaped_only_rhs_kernel(),
       _mtx_b_reshape_kernel(),
       _mtx_a_reduction_kernel(),
       _mtx_b_reduction_kernel(),
@@ -58,7 +57,6 @@
       _offset_contribution_output_stage_kernel(),
       _vector_sum_col(),
       _vector_sum_row(),
-      _tmp_a(),
       _tmp_b(),
       _mm_result_s32(),
       _original_b(nullptr),
@@ -86,7 +84,6 @@
     const GPUTarget gpu_target = CLScheduler::get().target();
 
     // Set the target for the kernels
-    _mtx_a_reshape_kernel.set_target(gpu_target);
     _mm_kernel.set_target(gpu_target);
 
     const ICLTensor *matrix_a = a;
@@ -105,29 +102,21 @@
     const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
 
     // Check if we need to reshape the matrix A and matrix B
-    _is_gemm_reshaped = is_gemm_reshaped(m, _reshape_b_only_on_first_run, gpu_target);
+    _is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
 
     if(_is_gemm_reshaped)
     {
-        // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
-        reinterpret_input_as_3d = false;
-
-        matrix_a = &_tmp_a;
         matrix_b = &_tmp_b;
 
-        _memory_group.manage(&_tmp_a);
         if(!_reshape_b_only_on_first_run)
         {
             _memory_group.manage(&_tmp_b);
         }
 
         // Pick up the GEMM configuration
-        std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+        std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
-        // Configure interleave kernel
-        _mtx_a_reshape_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
-
-        // Configure transpose kernel
+        // Configure reshape RHS kernel
         _mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info);
     }
 
@@ -166,7 +155,7 @@
         if(_is_gemm_reshaped)
         {
             // Configure and tune matrix multiply kernel
-            _mm_reshaped_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            _mm_reshaped_only_rhs_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
         }
         else
         {
@@ -185,7 +174,7 @@
         if(_is_gemm_reshaped)
         {
             // Configure and tune matrix multiply kernel
-            _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            _mm_reshaped_only_rhs_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
         }
         else
         {
@@ -200,7 +189,6 @@
     // Allocate tensors
     if(_is_gemm_reshaped)
     {
-        _tmp_a.allocator()->allocate();
         if(!_reshape_b_only_on_first_run)
         {
             _tmp_b.allocator()->allocate();
@@ -231,11 +219,13 @@
     const ITensorInfo *matrix_a_info = a;
     const ITensorInfo *matrix_b_info = b;
 
-    TensorInfo        tmp_a_info{};
     TensorInfo        tmp_b_info{};
     GEMMRHSMatrixInfo rhs_info;
     GEMMLHSMatrixInfo lhs_info;
 
+    // Get the GPU target
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
     const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
     const unsigned int n                       = b->dimension(0);
@@ -243,35 +233,24 @@
     const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
     const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
 
-    bool reshape_matrices = is_gemm_reshaped(m, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
-
-    // if reshape_matrices is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
-    if(reshape_matrices)
-    {
-        reinterpret_input_as_3d = false;
-    }
+    bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
 
     const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
 
-    if(reshape_matrices)
+    if(reshape_matrix_b)
     {
-        matrix_a_info = &tmp_a_info;
         matrix_b_info = &tmp_b_info;
 
         // Pick up the GEMM configuration
-        std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+        std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
-        // Validate interleave kernel
-        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-
-        // Validate transpose kernel
-
+        // Validate reshape RHS kernel
         auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
     }
 
-    TensorInfo info_vector_sum_col, info_vector_sum_row;
+    TensorInfo info_vector_sum_col{};
+    TensorInfo info_vector_sum_row{};
 
     // Validate matrix B reduction kernel only if _a_offset is not equal to 0
     if(a_offset != 0)
@@ -295,13 +274,13 @@
     {
         TensorInfo mm_result_s32_info{};
 
-        if(reshape_matrices)
+        if(reshape_matrix_b)
         {
             // Output tensor auto initialization if not yet initialized
             auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
 
             // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
         }
         else
         {
@@ -322,22 +301,25 @@
     }
     else
     {
-        if(reshape_matrices)
+        if(reshape_matrix_b)
         {
             // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
         }
         else
         {
             // Validate matrix multiply
             ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info));
         }
-        // Validate offset contribution kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
-                                                                                 a_offset == 0 ? nullptr : &info_vector_sum_col,
-                                                                                 b_offset == 0 ? nullptr : &info_vector_sum_row,
-                                                                                 c,
-                                                                                 a_offset, b_offset));
+        if(output->total_size() != 0)
+        {
+            // Validate offset contribution kernel
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
+                                                                                     a_offset == 0 ? nullptr : &info_vector_sum_col,
+                                                                                     b_offset == 0 ? nullptr : &info_vector_sum_row,
+                                                                                     c,
+                                                                                     a_offset, b_offset));
+        }
     }
 
     return Status{};
@@ -347,13 +329,10 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_is_gemm_reshaped)
     {
-        // Run reshape matrix A
-        CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
-
         if(!_reshape_b_only_on_first_run)
         {
             // Run reshape matrix B
@@ -370,7 +349,7 @@
     // Run matrix multiply
     if(_is_gemm_reshaped)
     {
-        CLScheduler::get().enqueue(_mm_reshaped_kernel, false);
+        CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, false);
     }
     else
     {
@@ -393,8 +372,6 @@
         // Run offset contribution kernel
         CLScheduler::get().enqueue(_offset_contribution_kernel, true);
     }
-
-    _memory_group.release();
 }
 
 void CLGEMMLowpMatrixMultiplyCore::prepare()
@@ -422,4 +399,4 @@
         _is_prepared = true;
     }
 }
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
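
The reduction and offset-contribution kernels referenced in this file implement the standard zero-point correction for quantized GEMM. The sketch below is a toy check of that arithmetic (made-up sizes and values), not the library's kernels: the row sums of A and column sums of B are what the A/B reduction kernels are assumed to produce.

#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    const int     M = 2, N = 2, K = 3;
    const int32_t a_off = 3, b_off = 5;
    const std::vector<uint8_t> A = { 10, 11, 12, 13, 14, 15 }; // MxK, row-major
    const std::vector<uint8_t> B = { 1, 2, 3, 4, 5, 6 };       // KxN, row-major

    for(int i = 0; i < M; ++i)
    {
        for(int j = 0; j < N; ++j)
        {
            int32_t raw = 0, row_sum = 0, col_sum = 0, exact = 0;
            for(int k = 0; k < K; ++k)
            {
                raw += A[i * K + k] * B[k * N + j];
                row_sum += A[i * K + k];     // per-row sum of A (matrix A reduction)
                col_sum += B[k * N + j];     // per-column sum of B (matrix B reduction)
                exact += (A[i * K + k] - a_off) * (B[k * N + j] - b_off);
            }
            // (A - a_off)(B - b_off) = A*B - b_off*row_sum(A) - a_off*col_sum(B) + K*a_off*b_off
            const int32_t corrected = raw - b_off * row_sum - a_off * col_sum + K * a_off * b_off;
            assert(corrected == exact); // the offset contribution restores the exact result
        }
    }
    return 0;
}
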
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
index f30eee1..ea803e4 100644
--- a/src/runtime/CL/functions/CLGaussian5x5.cpp
+++ b/src/runtime/CL/functions/CLGaussian5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,10 +62,8 @@
 {
     CLScheduler::get().enqueue(_border_handler, false);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     CLScheduler::get().enqueue(_kernel_hor, false);
     CLScheduler::get().enqueue(_kernel_vert);
-
-    _memory_group.release();
 }
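
The acquire()/release() pairs removed throughout these run() methods are replaced by a scope object. A minimal sketch of that scope-guard idiom follows, assuming MemoryGroupResourceScope behaves like the toy classes below; ToyMemoryGroup and ToyScope are made-up names.

#include <iostream>

struct ToyMemoryGroup
{
    void acquire() { std::cout << "acquire\n"; }
    void release() { std::cout << "release\n"; }
};

class ToyScope
{
public:
    explicit ToyScope(ToyMemoryGroup &group) : _group(group) { _group.acquire(); }
    ~ToyScope() { _group.release(); }
    ToyScope(const ToyScope &) = delete;
    ToyScope &operator=(const ToyScope &) = delete;

private:
    ToyMemoryGroup &_group;
};

void run_like_function(ToyMemoryGroup &mg, bool early_exit)
{
    ToyScope scope_mg(mg); // replaces the explicit mg.acquire()
    if(early_exit)
    {
        return;            // release() still runs on this path
    }
    // ... enqueue kernels ...
}                          // release() runs here, replacing the explicit mg.release()

int main()
{
    ToyMemoryGroup mg;
    run_like_function(mg, true);
    run_like_function(mg, false);
    return 0;
}
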
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index fd82769..b671b23 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -76,10 +76,10 @@
 
     if(num_levels > 1)
     {
-        _horizontal_border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
-        _vertical_border_handler   = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
-        _horizontal_reduction      = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
-        _vertical_reduction        = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
+        _horizontal_border_handler.resize(num_levels - 1);
+        _vertical_border_handler.resize(num_levels - 1);
+        _horizontal_reduction.resize(num_levels - 1);
+        _vertical_reduction.resize(num_levels - 1);
 
         // Apply half scale to the X dimension of the tensor shape
         TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -153,8 +153,8 @@
 
     if(num_levels > 1)
     {
-        _gauss5x5      = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(num_levels - 1);
-        _scale_nearest = arm_compute::support::cpp14::make_unique<CLScaleKernel[]>(num_levels - 1);
+        _gauss5x5.resize(num_levels - 1);
+        _scale_nearest.resize(num_levels - 1);
 
         PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
 
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index c50132e..d712a23 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -256,7 +256,7 @@
 void CLGenerateProposalsLayer::run()
 {
     // Acquire all the temporaries
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Compute all the anchors
     CLScheduler::get().enqueue(_compute_anchors_kernel, false);
@@ -277,8 +277,5 @@
     // Add dummy batch indexes
     CLScheduler::get().enqueue(_memset_kernel, true);
     CLScheduler::get().enqueue(_padded_copy_kernel, true);
-
-    // Release all the temporaries
-    _memory_group.release();
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index 1470d5c..0931443 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -95,7 +95,7 @@
 
 void CLHOGDescriptor::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run gradient
     _gradient.run();
@@ -105,6 +105,4 @@
 
     // Run block normalization
     CLScheduler::get().enqueue(_block_norm);
-
-    _memory_group.release();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index 51aeaed..e509fd8 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,13 +71,11 @@
 
 void CLHOGGradient::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run derivative
     _derivative.run();
 
     // Run magnitude/phase kernel
     CLScheduler::get().enqueue(_mag_phase);
-
-    _memory_group.release();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index 8012c2f..f799d61 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -128,12 +128,11 @@
     _num_block_norm_kernel  = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
     _num_hog_detect_kernel  = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
 
-    _orient_bin_kernel = arm_compute::support::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
-    _block_norm_kernel = arm_compute::support::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
-    _hog_detect_kernel = arm_compute::support::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
-    _non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
-    _hog_space         = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
-    _hog_norm_space    = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
+    _orient_bin_kernel.resize(_num_orient_bin_kernel);
+    _block_norm_kernel.resize(_num_block_norm_kernel);
+    _hog_detect_kernel.resize(_num_hog_detect_kernel);
+    _hog_space.resize(_num_orient_bin_kernel);
+    _hog_norm_space.resize(_num_block_norm_kernel);
 
     // Allocate tensors for magnitude and phase
     TensorInfo info_mag(shape_img, Format::S16);
@@ -172,10 +171,10 @@
         _hog_space[i].allocator()->init(info_space);
 
         // Manage intermediate buffers
-        _memory_group.manage(_hog_space.get() + i);
+        _memory_group.manage(&_hog_space[i]);
 
         // Initialise orientation binning kernel
-        _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+        _orient_bin_kernel[i].configure(&_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info());
     }
 
     // Allocate intermediate tensors
@@ -193,10 +192,10 @@
         _hog_norm_space[i].allocator()->init(tensor_info);
 
         // Manage intermediate buffers
-        _memory_group.manage(_hog_norm_space.get() + i);
+        _memory_group.manage(&_hog_norm_space[i]);
 
         // Initialize block normalization kernel
-        _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+        _block_norm_kernel[i].configure(&_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info());
     }
 
     // Allocate intermediate tensors
@@ -212,13 +211,13 @@
     {
         const size_t idx_block_norm = input_hog_detect[i];
 
-        _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+        _hog_detect_kernel[i].configure(&_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
     }
 
     detection_window_strides->unmap(CLScheduler::get().queue());
 
     // Configure non maxima suppression kernel
-    _non_maxima_kernel->configure(_detection_windows, min_distance);
+    _non_maxima_kernel.configure(_detection_windows, min_distance);
 
     // Allocate intermediate tensors
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
@@ -231,7 +230,7 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Reset detection window
     _detection_windows->clear();
@@ -242,13 +241,13 @@
     // Run orientation binning kernel
     for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
     {
-        CLScheduler::get().enqueue(*(_orient_bin_kernel.get() + i), false);
+        CLScheduler::get().enqueue(_orient_bin_kernel[i], false);
     }
 
     // Run block normalization kernel
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
-        CLScheduler::get().enqueue(*(_block_norm_kernel.get() + i), false);
+        CLScheduler::get().enqueue(_block_norm_kernel[i], false);
     }
 
     // Run HOG detector kernel
@@ -262,9 +261,7 @@
     {
         // Map detection windows array before computing non maxima suppression
         _detection_windows->map(CLScheduler::get().queue(), true);
-        Scheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
+        Scheduler::get().schedule(&_non_maxima_kernel, Window::DimY);
         _detection_windows->unmap(CLScheduler::get().queue());
     }
-
-    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 65ce7de..67f550d3 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,7 +55,7 @@
       _gy(),
       _score(),
       _nonmax(),
-      _corners_list(nullptr),
+      _corners_list(),
       _num_corner_candidates(0),
       _corners(nullptr)
 {
@@ -84,7 +84,7 @@
     _score.allocator()->init(info_f32);
     _nonmax.allocator()->init(info_f32);
 
-    _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+    _corners_list.resize(shape.x() * shape.y());
 
     // Manage intermediate buffers
     _memory_group.manage(&_gx);
@@ -146,20 +146,20 @@
     _score.allocator()->allocate();
 
     // Init corner candidates kernel
-    _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+    _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates);
 
     // Allocate intermediate buffers
     _nonmax.allocator()->allocate();
 
     // Init euclidean distance
-    _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist);
+    _sort_euclidean.configure(_corners_list.data(), _corners, &_num_corner_candidates, min_dist);
 }
 
 void CLHarrisCorners::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Reset the number of corner candidates to 0
     _num_corner_candidates = 0;
@@ -185,6 +185,4 @@
     _corners->map(CLScheduler::get().queue(), true);
     Scheduler::get().schedule(&_sort_euclidean, Window::DimY);
     _corners->unmap(CLScheduler::get().queue());
-
-    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 2e3c6d7..136cb5e 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,11 +74,9 @@
 
 void CLL2NormalizeLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     _reduce_func.run();
     CLScheduler::get().enqueue(_normalize_kernel, true);
-
-    _memory_group.release();
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index f01b1b8..4606a66 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -43,10 +43,11 @@
       _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state(), _accum_cell_state1(), _accum_cell_state2(),
       _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(),
       _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(),
-      _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _input_gate_out5(),
-      _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(),
-      _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false),
-      _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false)
+      _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
+      _ones_memset_kernel(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(),
+      _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(),
+      _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false),
+      _is_prepared(false)
 {
 }
 
@@ -93,25 +94,38 @@
                                                      lstm_params_info, activation_info, cell_threshold, projection_threshold));
 
     const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
-
     // Configure block that calculates the forget gate
     // forget_gate = Activation(input * input_to_forget_weights + output_state_in * recurrent_to_forget_weights + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
-    TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    // We optimize this as follows:
+    // forget_gate = Activation( (input,output_state_in) * (input_to_forget_weights,recurrent_to_forget_weights) + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
     _forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
     _forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     _forget_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
-    _memory_group.manage(&_forget_gate_out1);
-    _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1);
+    std::vector<const ICLTensor *> inputs_vector;
+    inputs_vector.emplace_back(input);
+    inputs_vector.emplace_back(output_state_in);
+    const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+    _forget_gate_out2.allocator()->init(TensorInfo(concat_shape, 1, input->info()->data_type()));
+
     _memory_group.manage(&_forget_gate_out2);
-    _transpose_forget_gate.configure(recurrent_to_forget_weights, &_forget_gate_out2);
-    _memory_group.manage(&_forget_gate_out3);
-    _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
-    _forget_gate_out2.allocator()->allocate();
+    _concat_inputs_forget_gate.configure(input, output_state_in, &_forget_gate_out2);
+
+    std::vector<const ICLTensor *> weights_vector;
+
+    weights_vector.emplace_back(input_to_forget_weights);
+    weights_vector.emplace_back(recurrent_to_forget_weights);
+    const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
+    _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type()));
+
+    _concat_weights_forget_gate.configure(input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6);
+
     _memory_group.manage(&_forget_gate_out5);
-    _accum_forget_gate1.configure(ArithmeticOperation::ADD, &_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
-    _forget_gate_out1.allocator()->allocate();
+    _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, forget_gate_bias, &_forget_gate_out5);
+    _memory_group.manage(&_forget_gate_out1);
+    _memory_group.manage(&_forget_gate_out3);
+    _forget_gate_out6.allocator()->allocate();
+
     CLTensor *forget_gate_out = &_forget_gate_out5;
     if(lstm_params.has_peephole_opt())
     {
@@ -134,43 +148,46 @@
     // Configure block that calculates the input gate
     // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     // input_gate = 1 - forget_gate, with CIFG
+    // We optimize this as follows:
+    // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     CLTensor *input_gate_out = &_input_gate_out1;
     if(lstm_params.has_cifg_opt())
     {
         _memory_group.manage(&_input_gate_out1);
         _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _ones_memset_kernel.configure(&_ones, PixelValue(1, _ones.info()->data_type()));
         _subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
         _ones.allocator()->allocate();
         _run_cifg_opt = true;
     }
     else
     {
-        TensorShape input_gate_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
-
-        _input_gate_out2.allocator()->init(TensorInfo(input_gate_shape, 1, input->info()->data_type()));
         _input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _input_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-        _input_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+        std::vector<const ICLTensor *> lstm_weights;
+        lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+        lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+        TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+        _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type()));
+
+        _concat_weights_input_gate.configure(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
 
         _memory_group.manage(&_input_gate_out1);
-        _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1);
-        _memory_group.manage(&_input_gate_out2);
-        _transpose_input_gate.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
+
         _memory_group.manage(&_input_gate_out3);
-        _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
+        _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, lstm_params.input_gate_bias(), &_input_gate_out3);
         _input_gate_out2.allocator()->allocate();
-        _memory_group.manage(&_input_gate_out4);
-        _accum_input_gate1.configure(ArithmeticOperation::ADD, &_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
-        _input_gate_out3.allocator()->allocate();
-        input_gate_out = &_input_gate_out4;
+
+        input_gate_out = &_input_gate_out3;
         if(_run_peephole_opt)
         {
-            _memory_group.manage(&_input_gate_out5);
-            _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-            _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _memory_group.manage(&_input_gate_out4);
+            _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+            _accum_input_gate2.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _input_gate_out3.allocator()->allocate();
             _input_gate_out4.allocator()->allocate();
-            _input_gate_out5.allocator()->allocate();
             input_gate_out = &_input_gate_out1;
         }
         else
@@ -215,35 +232,39 @@
 
     // Configure block that calculates the output
     // output_state_out = Activation(input * input_to_output_weights + output_state_in * recurrent_to_output_weights + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
-    TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    // We optimize this as follows:
+    // output_state_out = Activation( (input,output_state_in) * (input_to_output_weights, recurrent_to_output_weights) + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
     _output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
-    _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _output5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _output4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    std::vector<const ICLTensor *> in_out_weights;
+    in_out_weights.emplace_back(input_to_output_weights);
+    in_out_weights.emplace_back(recurrent_to_output_weights);
+    TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+    _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type()));
+
+    _concat_weights_output.configure(input_to_output_weights, recurrent_to_output_weights, &_output2);
 
     _memory_group.manage(&_output1);
-    _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1);
-    _memory_group.manage(&_output2);
-    _transpose_output.configure(recurrent_to_output_weights, &_output2);
-    _memory_group.manage(&_output3);
-    _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
+    _memory_group.manage(&_output4);
+
+    _fully_connected_output.configure(&_forget_gate_out2, &_output2, output_gate_bias, &_output4);
+
     _output2.allocator()->allocate();
-    _memory_group.manage(&_output5);
-    _accum_output1.configure(ArithmeticOperation::ADD, &_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
-    _output3.allocator()->allocate();
-    CLTensor *output_gate_out = &_output5;
+    _forget_gate_out2.allocator()->allocate();
+
+    CLTensor *output_gate_out = &_output4;
     if(lstm_params.has_peephole_opt())
     {
-        _output4.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
+        _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
 
-        _memory_group.manage(&_output4);
-        _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-        _accum_output2.configure(&_output5, &_output4, &_output1, ConvertPolicy::SATURATE);
-        _output5.allocator()->allocate();
+        _memory_group.manage(&_output3);
+        _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+        _accum_output2.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
+        _output4.allocator()->allocate();
         output_gate_out = &_output1;
 
         // Allocate intermediate buffers
-        _output4.allocator()->allocate();
+        _output3.allocator()->allocate();
     }
     else
     {
@@ -369,8 +390,15 @@
 
     // Validate forget gate
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+
+    std::vector<const ITensorInfo *> inputs_vector;
+    inputs_vector.emplace_back(input);
+    inputs_vector.emplace_back(output_state_in);
+    const TensorShape concat_shape       = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+    TensorInfo        forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input, output_state_in, &forget_gate_concat));
+
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
@@ -388,9 +416,15 @@
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
 
+        std::vector<const ITensorInfo *> lstm_weights;
+        lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+        lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+        TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+        TensorInfo  lstm_gate_concat          = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
+        ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &lstm_gate_concat));
+
         ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &input_gate, 1.f, 0.f, GEMMInfo()));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+
         if(lstm_params.has_peephole_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
@@ -419,10 +453,15 @@
                                                                                                                     cell_threshold)));
     }
 
+    std::vector<const ITensorInfo *> in_out_weights;
+    in_out_weights.emplace_back(input_to_output_weights);
+    in_out_weights.emplace_back(recurrent_to_output_weights);
+    TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+    TensorInfo  in_out_gate_concat          = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input_to_output_weights, recurrent_to_output_weights, &in_out_gate_concat));
     // Validate output gate tmp
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &output_gate_tmp, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
@@ -464,12 +503,13 @@
 
 void CLLSTMLayer::run()
 {
-    _memory_group.acquire();
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    CLScheduler::get().enqueue(_concat_inputs_forget_gate);
 
     _fully_connected_forget_gate.run();
-    CLScheduler::get().enqueue(_transpose_forget_gate);
-    _gemm_forget_gate.run();
-    CLScheduler::get().enqueue(_accum_forget_gate1);
 
     if(_run_peephole_opt)
     {
@@ -480,24 +520,13 @@
 
     if(_run_cifg_opt)
     {
-        _ones.map(true);
-        if(_ones.info()->data_type() == DataType::F16)
-        {
-            std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
-        }
-        else
-        {
-            std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
-        }
-        _ones.unmap();
+        CLScheduler::get().enqueue(_ones_memset_kernel);
         CLScheduler::get().enqueue(_subtract_input_gate);
     }
     else
     {
         _fully_connected_input_gate.run();
-        CLScheduler::get().enqueue(_transpose_input_gate);
-        _gemm_input_gate.run();
-        CLScheduler::get().enqueue(_accum_input_gate1);
+
         if(_run_peephole_opt)
         {
             CLScheduler::get().enqueue(_pixelwise_mul_input_gate);
@@ -521,9 +550,6 @@
     }
 
     _fully_connected_output.run();
-    CLScheduler::get().enqueue(_transpose_output);
-    _gemm_output.run();
-    CLScheduler::get().enqueue(_accum_output1);
 
     if(_run_peephole_opt)
     {
@@ -548,6 +574,18 @@
     CLScheduler::get().enqueue(_copy_output);
 
     _concat_scratch_buffer.run();
+}
 
-    _memory_group.release();
+void CLLSTMLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        CLScheduler::get().enqueue(_concat_weights_forget_gate);
+        if(!_run_cifg_opt)
+        {
+            CLScheduler::get().enqueue(_concat_weights_input_gate);
+        }
+        CLScheduler::get().enqueue(_concat_weights_output);
+        _is_prepared = true;
+    }
 }
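
A minimal sketch of the algebra behind the concatenation rewrite in this file: a fully connected layer applied to [x | h] with width-concatenated weights equals the sum of the two separate products, which is why one FC call replaces the previous FC + transpose + GEMM + add chain. The fc() helper, sizes and values are illustrative assumptions, not library code.

#include <cassert>
#include <vector>

// y (n) = x (d) * W^T, with W stored row-major as n x d.
static std::vector<float> fc(const std::vector<float> &x, const std::vector<float> &W, int n, int d)
{
    std::vector<float> y(n, 0.f);
    for(int i = 0; i < n; ++i)
    {
        for(int j = 0; j < d; ++j)
        {
            y[i] += x[j] * W[i * d + j];
        }
    }
    return y;
}

int main()
{
    const int n = 2, d_in = 3, d_state = 2;
    const std::vector<float> x  = { 1.f, 2.f, 3.f };
    const std::vector<float> h  = { 4.f, 5.f };
    const std::vector<float> Wx = { 1, 0, 1, 0, 1, 0 }; // n x d_in
    const std::vector<float> Wh = { 2, 1, 1, 2 };       // n x d_state

    // Separate path: two products, then an addition.
    const std::vector<float> y1 = fc(x, Wx, n, d_in);
    const std::vector<float> y2 = fc(h, Wh, n, d_state);

    // Concatenated path: one product on [x|h] and the width-concatenated weights.
    std::vector<float> xh = x;
    xh.insert(xh.end(), h.begin(), h.end());
    std::vector<float> Wxh;
    for(int i = 0; i < n; ++i)
    {
        Wxh.insert(Wxh.end(), Wx.begin() + i * d_in, Wx.begin() + (i + 1) * d_in);
        Wxh.insert(Wxh.end(), Wh.begin() + i * d_state, Wh.begin() + (i + 1) * d_state);
    }
    const std::vector<float> y = fc(xh, Wxh, n, d_in + d_state);

    for(int i = 0; i < n; ++i)
    {
        assert(y[i] == y1[i] + y2[i]); // [x|h] * [Wx|Wh]^T == x*Wx^T + h*Wh^T
    }
    return 0;
}
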
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index 559b57f..a118518 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,8 +70,8 @@
     // Create Gaussian Pyramid function
     _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
 
-    _convf = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(_num_levels);
-    _subf  = arm_compute::support::cpp14::make_unique<CLArithmeticSubtraction[]>(_num_levels);
+    _convf.resize(_num_levels);
+    _subf.resize(_num_levels);
 
     for(unsigned int i = 0; i < _num_levels; ++i)
     {
diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
index 911c9b3..13116bf 100644
--- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
+++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,8 +63,8 @@
     _tmp_pyr.init(pyramid_info);
 
     // Allocate add and scale functions. Level 0 does not need to be scaled.
-    _addf   = arm_compute::support::cpp14::make_unique<CLArithmeticAddition[]>(num_levels);
-    _scalef = arm_compute::support::cpp14::make_unique<CLScale[]>(num_levels - 1);
+    _addf.resize(num_levels);
+    _scalef.resize(num_levels - 1);
 
     const size_t last_level = num_levels - 1;
 
@@ -85,7 +85,7 @@
 
 void CLLaplacianReconstruct::run()
 {
-    ARM_COMPUTE_ERROR_ON_MSG(_addf == nullptr, "Unconfigured function");
+    ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function");
 
     const size_t last_level = _tmp_pyr.info()->num_levels() - 1;
 
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 5c6bef9..3e99dde 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -176,7 +176,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run input reshaping
     CLScheduler::get().enqueue(_input_im2col_kernel);
@@ -186,8 +186,6 @@
 
     // Reshape output matrix
     CLScheduler::get().enqueue(_output_col2im_kernel, false);
-
-    _memory_group.release();
 }
 
 void CLLocallyConnectedLayer::prepare()
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
index 157f306..8517b59 100644
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -104,7 +104,7 @@
 template <typename T>
 void CLMeanStdDev::run_float()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Perform reduction on x-axis
     _reduction_operation_mean.run();
@@ -140,8 +140,6 @@
         _reduction_output_stddev.unmap();
     }
     _reduction_output_mean.unmap();
-
-    _memory_group.release();
 }
 
 void CLMeanStdDev::run_int()
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index d00b1b5..a013a1f 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -84,12 +84,12 @@
     const int   old_values_list_length = list_length * window_dimension * window_dimension;
 
     // Create kernels and tensors
-    _tracker_init_kernel   = arm_compute::support::cpp14::make_unique<CLLKTrackerInitKernel[]>(_num_levels);
-    _tracker_stage0_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage0Kernel[]>(_num_levels);
-    _tracker_stage1_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage1Kernel[]>(_num_levels);
-    _func_scharr           = arm_compute::support::cpp14::make_unique<CLScharr3x3[]>(_num_levels);
-    _scharr_gx             = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
-    _scharr_gy             = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
+    _tracker_init_kernel.resize(_num_levels);
+    _tracker_stage0_kernel.resize(_num_levels);
+    _tracker_stage1_kernel.resize(_num_levels);
+    _func_scharr.resize(_num_levels);
+    _scharr_gx.resize(_num_levels);
+    _scharr_gy.resize(_num_levels);
 
     // Create internal keypoint arrays
     _old_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
@@ -118,8 +118,8 @@
         _scharr_gy[i].allocator()->init(tensor_info);
 
         // Manage intermediate buffers
-        _memory_group.manage(_scharr_gx.get() + i);
-        _memory_group.manage(_scharr_gy.get() + i);
+        _memory_group.manage(&_scharr_gx[i]);
+        _memory_group.manage(&_scharr_gy[i]);
 
         // Init Scharr kernel
         _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
@@ -149,7 +149,7 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     for(unsigned int level = _num_levels; level > 0; --level)
     {
@@ -167,6 +167,4 @@
     }
 
     CLScheduler::get().enqueue(_tracker_finalize_kernel, true);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 3aa1b1e..99e3121 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -25,39 +25,293 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "support/ToolchainSupport.h"
 
 namespace arm_compute
 {
 CLPadLayer::CLPadLayer()
-    : _copy_kernel(), _fillborder_kernel(), _memset_kernel()
+    : _copy_kernel(), _mode(), _padding(), _memset_kernel(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
 {
 }
 
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value)
+void CLPadLayer::configure_constant_mode(ICLTensor *input, ICLTensor *output, const PaddingList &padding, const PixelValue constant_value)
 {
-    // Copy the input to the output
-    _copy_kernel.configure(input, output, padding);
-
-    // Set the pages of the output to zero
+    // Fill the whole output with the constant_value.
     _memset_kernel.configure(output, constant_value);
 
-    // Fill padding on the first two dimensions with zeros
-    _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT, constant_value);
+    // Fill out padding list with zeroes.
+    PaddingList padding_extended = padding;
+    for(size_t i = padding.size(); i < TensorShape::num_max_dimensions; i++)
+    {
+        padding_extended.emplace_back(PaddingInfo{ 0, 0 });
+    }
+
+    // Create a window within the output tensor where the input will be copied.
+    Window copy_window = Window();
+    for(uint32_t i = 0; i < output->info()->num_dimensions(); ++i)
+    {
+        copy_window.set(i, Window::Dimension(padding_extended[i].first, padding_extended[i].first + input->info()->dimension(i), 1));
+    }
+    // Copy the input to the output, leaving the padding filled with the constant_value.
+    _copy_kernel.configure(input, output, PaddingList(), &copy_window);
 }
 
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
+void CLPadLayer::configure_reflect_symmetric_mode(ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, constant_value));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, padding));
+    int64_t last_padding_dimension = _padding.size() - 1;
+    // Reflecting can be performed by effectively unfolding the input as follows:
+    // For each dimension starting at DimX:
+    //      Create a before and an after slice, whose values depend on the selected padding mode
+    //      Concatenate the before and after padding with the tensor to be padded
 
+    // Two strided slice functions will be required for each padded dimension, as well as a
+    // concatenate function and the tensors to hold the temporary results.
+    _slice_functions.resize(2 * _num_dimensions);
+    _slice_results.resize(2 * _num_dimensions);
+    _concat_functions.resize(_num_dimensions);
+    _concat_results.resize(_num_dimensions - 1);
+
+    Coordinates starts_before{};
+    Coordinates ends_before{};
+    Coordinates starts_after{};
+    Coordinates ends_after{};
+    Coordinates strides{};
+    ICLTensor *prev = input;
+    for(uint32_t i = 0; i < _num_dimensions; ++i)
+    {
+        // Strides from previous dimensions must be reset to 1 so that those dimensions are not reversed again.
+        if(i > 0)
+        {
+            strides.set(i - 1, 1);
+        }
+
+        if(_padding[i].first > 0 || _padding[i].second > 0)
+        {
+            // Set the starts, ends, and strides values for the current dimension.
+            // Due to the bit masks passed to strided slice, the values below the current dimension in
+            // starts and ends will be ignored, so they do not need to be modified.
+            if(_mode == PaddingMode::REFLECT)
+            {
+                starts_before.set(i, _padding[i].first);
+                ends_before.set(i, 0);
+                starts_after.set(i, input->info()->dimension(i) - 2);
+                ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 2);
+                strides.set(i, -1);
+            }
+            else
+            {
+                starts_before.set(i, _padding[i].first - 1);
+                ends_before.set(i, -1);
+                starts_after.set(i, input->info()->dimension(i) - 1);
+                ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 1);
+                strides.set(i, -1);
+            }
+
+            // Strided slice wraps negative indices around to the end of the range;
+            // here a negative index should mean the full range, so the bit mask is adjusted instead.
+            const int32_t begin_mask_before = starts_before[i] < 0 ? ~0 : ~(1u << i);
+            const int32_t end_mask_before   = ends_before[i] < 0 ? ~0 : ~(1u << i);
+            const int32_t begin_mask_after  = starts_after[i] < 0 ? ~0 : ~(1u << i);
+            const int32_t end_mask_after    = ends_after[i] < 0 ? ~0 : ~(1u << i);
+
+            // Reflect the input values for the padding before and after the input.
+            std::vector<ICLTensor *> concat_vector;
+            if(_padding[i].first > 0)
+            {
+                if(i < prev->info()->num_dimensions())
+                {
+                    _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
+                    concat_vector.push_back(&_slice_results[2 * i]);
+                }
+                else
+                {
+                    // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+                    concat_vector.push_back(prev);
+                }
+            }
+            concat_vector.push_back(prev);
+            if(_padding[i].second > 0)
+            {
+                if(i < prev->info()->num_dimensions())
+                {
+                    _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
+                    concat_vector.push_back(&_slice_results[2 * i + 1]);
+                }
+                else
+                {
+                    // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+                    concat_vector.push_back(prev);
+                }
+            }
+            // Concatenate the padding before and after with the input.
+            ICLTensor *out = (static_cast<int32_t>(i) == last_padding_dimension) ? output : &_concat_results[i];
+            _concat_functions[i].configure(concat_vector, out, i);
+            prev = out;
+        }
+    }
+    for(uint32_t i = 0; i < _num_dimensions; ++i)
+    {
+        if((static_cast<int32_t>(i) != last_padding_dimension))
+        {
+            _concat_results[i].allocator()->allocate();
+        }
+        _slice_results[2 * i].allocator()->allocate();
+        _slice_results[2 * i + 1].allocator()->allocate();
+    }
+}
+
+void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+{
+    ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
+
+    _padding = padding;
+    _mode    = mode;
+
+    TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
+
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
+
+    // Find the last dimension that requires padding, to know when to write directly to the output and whether any padding is applied at all.
+    int64_t last_padding_dimension = _padding.size() - 1;
+    for(; last_padding_dimension >= 0; --last_padding_dimension)
+    {
+        if(_padding[last_padding_dimension].first > 0 || _padding[last_padding_dimension].second > 0)
+        {
+            break;
+        }
+    }
+    _num_dimensions = last_padding_dimension + 1;
+    if(_num_dimensions > 0)
+    {
+        switch(_mode)
+        {
+            case PaddingMode::CONSTANT:
+            {
+                configure_constant_mode(input, output, padding, constant_value);
+                break;
+            }
+            case PaddingMode::REFLECT:
+            case PaddingMode::SYMMETRIC:
+            {
+                configure_reflect_symmetric_mode(input, output);
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Padding mode not supported.");
+        }
+    }
+    else
+    {
+        // Copy the input to the whole output if no padding is applied
+        _copy_kernel.configure(input, output);
+    }
+}
+
+Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions());
+
+    TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
+
+    // Use CLCopyKernel and CLMemsetKernel to validate all padding modes, as this covers all of the shape and info validation.
+    PaddingList padding_extended = padding;
+    for(size_t i = padding.size(); i < TensorShape::num_max_dimensions; i++)
+    {
+        padding_extended.emplace_back(PaddingInfo{ 0, 0 });
+    }
+
+    Window copy_window = Window();
+    for(uint32_t i = 0; i < padded_shape.num_dimensions(); ++i)
+    {
+        copy_window.set(i, Window::Dimension(padding_extended[i].first, padding_extended[i].first + input->dimension(i), 1));
+    }
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, PaddingList(), &copy_window));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, constant_value));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, &input->clone()->set_tensor_shape(padded_shape), PaddingList(), &copy_window));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(&input->clone()->set_tensor_shape(padded_shape), constant_value));
+    }
+
+    switch(mode)
+    {
+        case PaddingMode::CONSTANT:
+        {
+            break;
+        }
+        case PaddingMode::REFLECT:
+        case PaddingMode::SYMMETRIC:
+        {
+            for(uint32_t i = 0; i < padding.size(); ++i)
+            {
+                if(mode == PaddingMode::REFLECT)
+                {
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
+                }
+                else
+                {
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first > input->dimension(i));
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second > input->dimension(i));
+                }
+            }
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Invalid mode");
+        }
+    }
     return Status{};
 }
 
 void CLPadLayer::run()
 {
-    CLScheduler::get().enqueue(_memset_kernel, false);
-    CLScheduler::get().enqueue(_fillborder_kernel, false);
-    CLScheduler::get().enqueue(_copy_kernel, true);
+    if(_num_dimensions > 0)
+    {
+        switch(_mode)
+        {
+            case PaddingMode::CONSTANT:
+            {
+                CLScheduler::get().enqueue(_memset_kernel, false);
+                CLScheduler::get().enqueue(_copy_kernel, true);
+                break;
+            }
+            case PaddingMode::REFLECT:
+            case PaddingMode::SYMMETRIC:
+            {
+                for(uint32_t i = 0; i < _num_dimensions; ++i)
+                {
+                    if(_padding[i].first > 0 || _padding[i].second > 0)
+                    {
+                        if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
+                        {
+                            _slice_functions[2 * i].run();
+                        }
+                        if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
+                        {
+                            _slice_functions[2 * i + 1].run();
+                        }
+                        CLScheduler::get().sync();
+                        _concat_functions[i].run();
+                        CLScheduler::get().sync();
+                    }
+                }
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Padding mode not supported.");
+        }
+    }
+    else
+    {
+        CLScheduler::get().enqueue(_copy_kernel, true);
+    }
 }
 } // namespace arm_compute
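The REFLECT/SYMMETRIC path above builds each padded dimension from two reversed strided slices concatenated around the input. The following standalone C++ sketch (illustrative only, not part of the patch) shows how the slice bounds chosen in configure_reflect_symmetric_mode() behave for a 1-D input of size 4 with padding (2, 2):

    #include <cstdio>
    #include <vector>

    int main()
    {
        const std::vector<char> x{ 'a', 'b', 'c', 'd' };
        const int  n          = static_cast<int>(x.size());
        const int  pad_before = 2;
        const int  pad_after  = 2;
        const bool reflect    = true; // false -> SYMMETRIC

        // Bounds as configured above (stride is -1, end index is exclusive).
        const int start_before = reflect ? pad_before : pad_before - 1;
        const int end_before   = reflect ? 0 : -1;
        const int start_after  = reflect ? n - 2 : n - 1;
        const int end_after    = reflect ? n - pad_after - 2 : n - pad_after - 1;

        for(int i = start_before; i > end_before; --i) { std::printf("%c", x[i]); } // padding before
        for(int i = 0; i < n; ++i)                     { std::printf("%c", x[i]); } // the input itself
        for(int i = start_after; i > end_after; --i)   { std::printf("%c", x[i]); } // padding after
        std::printf("\n"); // REFLECT prints cbabcdcb, SYMMETRIC prints baabcddc
        return 0;
    }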
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index b4c20db..959464c 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,8 +29,8 @@
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
                                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
@@ -54,3 +54,26 @@
 {
     return CLPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
 }
+
+void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLComplexPixelWiseMultiplicationKernel>();
+    k->configure(input1, input2, output);
+    _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
+}
+
+Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLComplexPixelWiseMultiplicationKernel::validate(input1, input2, output);
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index a13859c..df10e1e 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,54 +21,22 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
 
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
+#include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
-CLQuantizationLayer::CLQuantizationLayer()
-    : _quantize_kernel(), _min_max_kernel(), _min_max()
+namespace arm_compute
 {
+void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLQuantizationLayerKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
 }
 
 Status CLQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
-    TensorInfo min_max{ input->num_channels(), input->data_type() };
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMinMaxLayerKernel::validate(input, &min_max));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayerKernel::validate(input, output, &min_max));
-
-    return Status{};
+    return CLQuantizationLayerKernel::validate(input, output);
 }
-
-void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel.
-    _min_max_kernel.configure(input, &_min_max);
-
-    // Configure quantize kernel
-    _quantize_kernel.configure(input, output, &_min_max);
-
-    // Allocate min_max tensor
-    _min_max.allocator()->allocate();
-}
-
-void CLQuantizationLayer::run()
-{
-    cl::CommandQueue q = CLScheduler::get().queue();
-
-    // Reset min and max
-    _min_max_kernel.reset(q);
-
-    // Run min-max kernel
-    CLScheduler::get().enqueue(_min_max_kernel, false);
-
-    // Run quantize kernel
-    CLScheduler::get().enqueue(_quantize_kernel, false);
-}
+} // namespace arm_compute
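The rewritten CLQuantizationLayer above now just creates one kernel in configure() and hands it to its base class to run. A hedged sketch of that single-kernel function shape, using illustrative stand-in types rather than the library's classes:

    #include <memory>
    #include <utility>

    struct DemoKernel
    {
        void configure(const float *input, float *output) { (void)input; (void)output; /* set kernel args */ }
        void run() { /* enqueue the kernel */ }
    };

    class DemoSimpleFunction
    {
    public:
        void run() { _kernel->run(); }

    protected:
        std::unique_ptr<DemoKernel> _kernel{};
    };

    class DemoQuantization : public DemoSimpleFunction
    {
    public:
        void configure(const float *input, float *output)
        {
            auto k = std::make_unique<DemoKernel>();
            k->configure(input, output);
            _kernel = std::move(k); // the base class owns and runs the kernel
        }
    };

    int main()
    {
        float in[4]  = { 0.f, 1.f, 2.f, 3.f };
        float out[4] = {};
        DemoQuantization q;
        q.configure(in, out);
        q.run();
        return 0;
    }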
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 63f00ac..19eb69f 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -105,7 +105,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     _fully_connected_kernel.run();
     _gemm_state_f.run();
@@ -114,8 +114,6 @@
 
     // copy hidden out to output
     CLScheduler::get().enqueue(_copy_kernel);
-
-    _memory_group.release();
 }
 
 void CLRNNLayer::prepare()
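The _memory_group.acquire()/release() pairs removed throughout this patch are replaced by a scoped MemoryGroupResourceScope object. A minimal sketch of that RAII idea, assuming the scope simply acquires on construction and releases on destruction (the stand-in types below are hypothetical, not the library's definitions):

    #include <cstdio>

    struct DemoMemoryGroup
    {
        void acquire() { std::printf("acquire\n"); }
        void release() { std::printf("release\n"); }
    };

    class DemoResourceScope
    {
    public:
        explicit DemoResourceScope(DemoMemoryGroup &g) : _g(g) { _g.acquire(); }
        ~DemoResourceScope() { _g.release(); }
        DemoResourceScope(const DemoResourceScope &) = delete;
        DemoResourceScope &operator=(const DemoResourceScope &) = delete;

    private:
        DemoMemoryGroup &_g;
    };

    int main()
    {
        DemoMemoryGroup group;
        {
            DemoResourceScope scope_mg(group); // acquires here
            // ... run kernels ...
        } // releases here on every exit path, including exceptions
        return 0;
    }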
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index b2d0f81..a3634cd 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLReduceMean.h"
 
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/Types.h"
@@ -40,10 +41,10 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
-    _reduction_ops     = reduction_axis.num_dimensions();
-    _reduction_kernels = arm_compute::support::cpp14::make_unique<CLReductionOperation[]>(_reduction_ops);
-    _reduced_outs      = arm_compute::support::cpp14::make_unique<CLTensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
-    _keep_dims         = keep_dims;
+    _reduction_ops = reduction_axis.num_dimensions();
+    _reduction_kernels.resize(_reduction_ops);
+    _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+    _keep_dims = keep_dims;
 
     Coordinates axis_local = reduction_axis;
     const int   input_dims = input->info()->num_dimensions();
@@ -57,9 +58,9 @@
     // Perform reduction for every axis
     for(unsigned int i = 0; i < _reduction_ops; ++i)
     {
-        TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+        TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
         out_shape.set(axis_local[i], 1);
-        auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+        auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
 
         if(i == _reduction_ops - 1 && keep_dims)
         {
@@ -68,8 +69,8 @@
         else
         {
             _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
-            _memory_group.manage(_reduced_outs.get() + i);
-            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
+            _memory_group.manage(&_reduced_outs[i]);
+            _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
         }
     }
 
@@ -92,13 +93,15 @@
             out_shape.remove_dimension(axis_local[i] - i);
         }
         auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
-        _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+        _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
     }
 }
 
 Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
     TensorShape out_shape = input->tensor_shape();
@@ -140,7 +143,7 @@
 
 void CLReduceMean::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     for(unsigned int i = 0; i < _reduction_ops; ++i)
     {
@@ -151,6 +154,5 @@
     {
         _reshape.run();
     }
-    _memory_group.release();
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 3d82e3f..9f99d2d 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -71,7 +71,7 @@
     else
     {
         // Create temporary tensor infos
-        auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
+        std::vector<TensorInfo> sums_vector(num_of_stages - 1);
 
         // Create intermediate tensor info
         TensorShape shape{ input->tensor_shape() };
@@ -110,17 +110,17 @@
         }
 
         // Validate ReductionOperation only on first kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, first_kernel_op));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, &sums_vector[0], axis, first_kernel_op));
 
         // Validate ReductionOperation on intermediate stages
         for(unsigned int i = 1; i < num_of_stages - 1; ++i)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[i - 1], &sums_vector[i], axis, intermediate_kernel_op));
         }
 
         // Validate ReductionOperation on the last stage
         const unsigned int last_stage = num_of_stages - 1;
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output, axis, last_kernel_op, input->dimension(0)));
     }
 
     return Status{};
@@ -133,7 +133,7 @@
     _is_serial      = is_data_type_quantized(input->info()->data_type()) || axis != 0;
 
     // Configure reduction operation kernels
-    _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
+    _reduction_kernels_vector.resize(_num_of_stages);
 
     // Create temporary tensors
     if(_is_serial)
@@ -142,8 +142,8 @@
     }
     else
     {
-        _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
-        _results_vector         = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+        _border_handlers_vector.resize(_num_of_stages);
+        _results_vector.resize(_num_of_stages - 1);
         TensorShape shape{ input->info()->tensor_shape() };
         for(unsigned int i = 0; i < _num_of_stages - 1; i++)
         {
@@ -152,7 +152,7 @@
         }
 
         // Apply ReductionOperation only on first kernel
-        _memory_group.manage(_results_vector.get());
+        _memory_group.manage(&_results_vector[0]);
 
         ReductionOperation first_kernel_op;
         ReductionOperation intermediate_kernel_op;
@@ -183,30 +183,30 @@
                 ARM_COMPUTE_ERROR("Not supported");
         }
 
-        _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op);
+        _reduction_kernels_vector[0].configure(input, &_results_vector[0], axis, first_kernel_op);
         _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
 
         // Apply ReductionOperation on intermediate stages
         for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
         {
-            _memory_group.manage(_results_vector.get() + i);
-            _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op);
-            _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
+            _memory_group.manage(&_results_vector[i]);
+            _reduction_kernels_vector[i].configure(&_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op);
+            _border_handlers_vector[i].configure(&_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
             _results_vector[i - 1].allocator()->allocate();
         }
 
         // Apply ReductionOperation on the last stage
         const unsigned int last_stage  = _num_of_stages - 1;
         const unsigned int input_width = input->info()->dimension(0);
-        _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
-        _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
+        _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output, axis, last_kernel_op, input_width);
+        _border_handlers_vector[last_stage].configure(&_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
         _results_vector[last_stage - 1].allocator()->allocate();
     }
 }
 
 void CLReductionOperation::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_is_serial)
     {
@@ -220,6 +220,4 @@
             CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
         }
     }
-
-    _memory_group.release();
 }
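Several functions in this patch swap make_unique<T[]> arrays for std::vector and then hand out addresses of individual elements, for example _memory_group.manage(&_results_vector[i]) above. That is only safe because each vector is resized once, up front, before any element addresses are taken. A small sketch of the constraint (illustrative types only):

    #include <cstdio>
    #include <vector>

    struct DemoTensor
    {
        int id = 0;
    };

    int main()
    {
        const unsigned int num_stages = 4;

        std::vector<DemoTensor> results;
        results.resize(num_stages - 1); // single up-front resize...

        std::vector<DemoTensor *> managed;
        for(unsigned int i = 0; i < num_stages - 1; ++i)
        {
            managed.push_back(&results[i]); // ...so these element addresses stay valid
        }
        // A later push_back/resize on 'results' could reallocate and invalidate 'managed'.

        std::printf("%zu tensors managed\n", managed.size());
        return 0;
    }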
diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp
index d4bc855..22fbef1 100644
--- a/src/runtime/CL/functions/CLSobel5x5.cpp
+++ b/src/runtime/CL/functions/CLSobel5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,10 +81,8 @@
 {
     CLScheduler::get().enqueue(_border_handler, false);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     CLScheduler::get().enqueue(_sobel_hor, false);
     CLScheduler::get().enqueue(_sobel_vert);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp
index 6083090..9b38f69 100644
--- a/src/runtime/CL/functions/CLSobel7x7.cpp
+++ b/src/runtime/CL/functions/CLSobel7x7.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,10 +81,8 @@
 {
     CLScheduler::get().enqueue(_border_handler, false);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     CLScheduler::get().enqueue(_sobel_hor, false);
     CLScheduler::get().enqueue(_sobel_vert);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index d671846..7e41dba 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -191,7 +191,7 @@
 
 void CLSoftmaxLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_needs_flattening)
     {
@@ -205,9 +205,6 @@
     {
         CLScheduler::get().enqueue(_reshape_kernel, true);
     }
-
-    // Relase intermediate buffers
-    _memory_group.release();
 }
 
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
index f084351..8d37d53 100644
--- a/src/runtime/CL/functions/CLSplit.cpp
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,8 +42,8 @@
 void CLSplit::configure(const ICLTensor *input, const std::vector<ICLTensor *> &outputs, unsigned int axis)
 {
     // Create Slice functions
-    _num_outputs     = outputs.size();
-    _slice_functions = arm_compute::support::cpp14::make_unique<CLSlice[]>(_num_outputs);
+    _num_outputs = outputs.size();
+    _slice_functions.resize(_num_outputs);
 
     // Get output shape
     const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs);
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
index 71327fe..2700b49 100644
--- a/src/runtime/CL/functions/CLStackLayer.cpp
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -46,8 +46,8 @@
 
 void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
 {
-    _num_inputs    = input.size();
-    _stack_kernels = arm_compute::support::cpp14::make_unique<CLStackLayerKernel[]>(_num_inputs);
+    _num_inputs = input.size();
+    _stack_kernels.resize(_num_inputs);
 
     // Wrap around negative values
     const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
index 428d091..eb1dd8c 100644
--- a/src/runtime/CL/functions/CLUnstack.cpp
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,7 +74,7 @@
     // Wrap around negative values
     const unsigned int axis_u = wrap_axis(axis, input->info());
     _num_slices               = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
-    _strided_slice_vector     = arm_compute::support::cpp14::make_unique<CLStridedSlice[]>(_num_slices);
+    _strided_slice_vector.resize(_num_slices);
 
     Coordinates slice_start;
     int32_t     slice_end_mask;
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index d0801a6..a8667c3 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -51,7 +51,7 @@
 
     // Output auto inizialitation if not yet initialized
     TensorInfo        tmp_output_info = *output->clone();
-    const TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    const TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
     auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
     switch(num_inputs)
@@ -90,7 +90,7 @@
     {
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
     }
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
 
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
@@ -109,7 +109,7 @@
             break;
         default:
             // Configure generic case WidthConcatenate kernels
-            _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
+            _concat_kernels_vector.resize(_num_inputs);
 
             unsigned int width_offset = 0;
             for(unsigned int i = 0; i < _num_inputs; ++i)
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 069196e..d3c3f98 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,6 +62,11 @@
         output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U,
                              kernel_dims.height == 1 ? 1U : 4U);
     }
+    else if(kernel_max_dim == 7U)
+    {
+        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U,
+                             kernel_dims.height == 1 ? 1U : 2U);
+    }
 
     return output_tile;
 }
@@ -73,7 +78,8 @@
 
     std::vector<WinogradConfiguration> fast_math_winograd =
     {
-        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
+        WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))
     };
 
     auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
@@ -198,7 +204,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run input transform
     _input_transform.run();
@@ -208,8 +214,6 @@
 
     // Run output transform
     CLScheduler::get().enqueue(_output_transform);
-
-    _memory_group.release();
 }
 
 void CLWinogradConvolutionLayer::prepare()
diff --git a/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp b/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
deleted file mode 100644
index cd97849..0000000
--- a/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h"
-
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-namespace
-{
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_gemm_reshaped(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
-                                                                        bool lhs_interleave, bool rhs_interleave)
-{
-    GEMMLHSMatrixInfo lhs_info;
-    GEMMRHSMatrixInfo rhs_info;
-
-    // Configure GEMMLHSMatrixInfo
-    lhs_info.m0         = m0;
-    lhs_info.k0         = k0;
-    lhs_info.v0         = ((m / (lhs_info.m0 * v0)) == 0) ? 1 : v0;
-    lhs_info.interleave = lhs_interleave;
-    lhs_info.transpose  = false;
-
-    // Configure GEMMRHSMatrixInfo
-    rhs_info.n0         = n0;
-    rhs_info.k0         = lhs_info.k0;
-    rhs_info.h0         = ((n / (rhs_info.n0 * h0)) == 0) ? 1 : h0;
-    rhs_info.interleave = rhs_interleave;
-    rhs_info.transpose  = true;
-
-    return std::make_pair(lhs_info, rhs_info);
-}
-
-} // namespace
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
-    ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
-    ARM_COMPUTE_UNUSED(data_type);
-
-    const GPUTarget gpu_target = CLScheduler::get().target();
-
-    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-
-    // Configurations for Mali-G76
-    static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G76 =
-    {
-        { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G76_f32 },
-        { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G76_u8 }
-    };
-
-    // Configurations for Mali-G7x
-    static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G7x =
-    {
-        { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G7x_f32 },
-        { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G7x_u8 }
-    };
-
-    switch(gpu_target)
-    {
-        case GPUTarget::G76:
-            return (this->*gemm_reshaped_configs_G76[data_type])(m, n, k, b);
-        default:
-            return (this->*gemm_reshaped_configs_G7x[data_type])(m, n, k, b);
-    }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
-
-    if(n <= 4)
-    {
-        return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
-    }
-    else
-    {
-        return configure_gemm_reshaped(m, n, 5, 4, 4, 2, 16, false, true);
-    }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
-
-    if(dot8_supported(CLKernelLibrary::get().get_device()))
-    {
-        if(n <= 4)
-        {
-            return configure_gemm_reshaped(m, n, 4, 2, 16, 2, 2, true, false);
-        }
-        else
-        {
-            return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, true, false);
-        }
-    }
-    else
-    {
-        if(n <= 4)
-        {
-            return configure_gemm_reshaped(m, n, 4, 2, 8, 2, 2, true, false);
-        }
-        else
-        {
-            return configure_gemm_reshaped(m, n, 6, 4, 4, 2, 2, true, true);
-        }
-    }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
-
-    if(n <= 4)
-    {
-        return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
-    }
-    else
-    {
-        return configure_gemm_reshaped(m, n, 4, 4, 2, 8, 16, false, false);
-    }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
-
-    if(n <= 4)
-    {
-        return configure_gemm_reshaped(m, n, 4, 2, 16, 4, 1, false, false);
-    }
-    else
-    {
-        return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, false, true);
-    }
-}
-} // namespace cl_gemm
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/tuners/CLLWSList.cpp b/src/runtime/CL/tuners/CLLWSList.cpp
new file mode 100644
index 0000000..30fd558
--- /dev/null
+++ b/src/runtime/CL/tuners/CLLWSList.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/tuners/CLLWSList.h"
+
+namespace arm_compute
+{
+namespace cl_tuner
+{
+size_t CLLWSList::size()
+{
+    return search_space_shape.total_size();
+}
+
+cl::NDRange CLLWSListExhaustive::operator[](size_t index)
+{
+    ARM_COMPUTE_ERROR_ON(index >= size());
+    auto coords = index2coords(search_space_shape, index);
+    return cl::NDRange{ coords[0] + 1U, coords[1] + 1U, coords[2] + 1U };
+}
+
+CLLWSListExhaustive::CLLWSListExhaustive(const cl::NDRange &gws)
+{
+    ARM_COMPUTE_UNUSED(gws);
+    search_space_shape = TensorShape(max_lws_supported_x,
+                                     max_lws_supported_y,
+                                     max_lws_supported_z);
+}
+
+cl::NDRange CLLWSListNormal::operator[](size_t index)
+{
+    ARM_COMPUTE_ERROR_ON(index >= size());
+    auto coords = index2coords(search_space_shape, index);
+    return cl::NDRange{ _lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]] };
+}
+
+CLLWSListNormal::CLLWSListNormal(const cl::NDRange &gws)
+{
+    auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
+    auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
+    auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
+
+    // Initialize the LWS values to test
+    initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+    initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+    initialize_lws_values(_lws_z, gws[2], lws_z_max, false);
+
+    search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
+}
+
+void CLLWSListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
+{
+    lws.push_back(1);
+
+    for(unsigned int i = 2; i <= lws_max; ++i)
+    {
+        // Power of two condition
+        const bool is_power_of_two = (i & (i - 1)) == 0;
+
+        // Condition on the modulo, according to the mod_let_one flag
+        const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
+
+        if(mod_cond || is_power_of_two)
+        {
+            lws.push_back(i);
+        }
+    }
+}
+
+CLLWSListRapid::CLLWSListRapid(const cl::NDRange &gws)
+{
+    auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
+    auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
+    auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
+
+    // Initialize the LWS values to test
+    initialize_lws_values(_lws_x, lws_x_max);
+    initialize_lws_values(_lws_y, lws_y_max);
+    initialize_lws_values(_lws_z, lws_z_max);
+
+    search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
+}
+
+void CLLWSListRapid::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max)
+{
+    lws.push_back(1);
+
+    for(unsigned int i = 2; i <= lws_max; i *= 4)
+    {
+        lws.push_back(i);
+    }
+}
+} // namespace cl_tuner
+} // namespace arm_compute
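The candidate local work sizes produced by CLLWSListNormal::initialize_lws_values are 1, every power of two up to lws_max, and every value satisfying the modulo condition on gws. A standalone run of that logic (copied from above, with printing added) for gws = 60 and lws_max = 8:

    #include <cstdio>
    #include <vector>

    int main()
    {
        const unsigned int gws = 60, lws_max = 8;
        const bool mod_let_one = false;

        std::vector<unsigned int> lws{ 1 };
        for(unsigned int i = 2; i <= lws_max; ++i)
        {
            const bool is_power_of_two = (i & (i - 1)) == 0;
            const bool mod_cond        = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
            if(mod_cond || is_power_of_two)
            {
                lws.push_back(i);
            }
        }
        for(auto v : lws) { std::printf("%u ", v); } // 1 2 3 4 5 6 8
        std::printf("\n");
        return 0;
    }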
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index 79e619c..9a141cb 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -600,7 +600,7 @@
         if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k())
         {
             std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
-            for(auto it : indices)
+            for(auto const &it : indices)
             {
                 const int               label         = it.first;
                 const std::vector<int> &label_indices = it.second;
@@ -614,7 +614,7 @@
                 for(auto idx : label_indices)
                 {
                     ARM_COMPUTE_ERROR_ON(idx > static_cast<int>(scores.size()));
-                    score_index_pairs.push_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
+                    score_index_pairs.emplace_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
                 }
             }
 
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index f3355a7..f7240db 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,16 +54,16 @@
 /* Make sure the bits we care about are defined, just in case asm/hwcap.h is
  * out of date (or for bare metal mode) */
 #ifndef HWCAP_ASIMDHP
-#define HWCAP_ASIMDHP (1 << 10)
-#endif /* HWCAP_ASIMDHP */
+#define HWCAP_ASIMDHP (1 << 10) // NOLINT
+#endif                          /* HWCAP_ASIMDHP */
 
 #ifndef HWCAP_CPUID
-#define HWCAP_CPUID (1 << 11)
-#endif /* HWCAP_CPUID */
+#define HWCAP_CPUID (1 << 11) // NOLINT
+#endif                        /* HWCAP_CPUID */
 
 #ifndef HWCAP_ASIMDDP
-#define HWCAP_ASIMDDP (1 << 20)
-#endif /* HWCAP_ASIMDDP */
+#define HWCAP_ASIMDDP (1 << 20) // NOLINT
+#endif                          /* HWCAP_ASIMDDP */
 
 namespace
 {
@@ -146,12 +146,12 @@
                 break;
         }
     }
-    else if(implementer == 0x48) // HiSilicon CPUs
+    else if(implementer == 0x48)
     {
         // Only CPUs we have code paths for are detected.  All other CPUs can be safely classed as "GENERIC"
         switch(cpunum)
         {
-            case 0xd40: // A76 (Kirin 980)
+            case 0xd40: // A76
                 model = CPUModel::GENERIC_FP16_DOT;
                 break;
             default:
@@ -220,8 +220,8 @@
 
         while(bool(getline(file, line)))
         {
-            regmatch_t match[2];
-            ret_status = regexec(&proc_regex, line.c_str(), 2, match, 0);
+            std::array<regmatch_t, 2> match;
+            ret_status = regexec(&proc_regex, line.c_str(), 2, match.data(), 0);
             if(ret_status == 0)
             {
                 std::string id     = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -244,7 +244,7 @@
                 continue;
             }
 
-            ret_status = regexec(&imp_regex, line.c_str(), 2, match, 0);
+            ret_status = regexec(&imp_regex, line.c_str(), 2, match.data(), 0);
             if(ret_status == 0)
             {
                 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -254,7 +254,7 @@
                 continue;
             }
 
-            ret_status = regexec(&var_regex, line.c_str(), 2, match, 0);
+            ret_status = regexec(&var_regex, line.c_str(), 2, match.data(), 0);
             if(ret_status == 0)
             {
                 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -264,7 +264,7 @@
                 continue;
             }
 
-            ret_status = regexec(&part_regex, line.c_str(), 2, match, 0);
+            ret_status = regexec(&part_regex, line.c_str(), 2, match.data(), 0);
             if(ret_status == 0)
             {
                 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -274,7 +274,7 @@
                 continue;
             }
 
-            ret_status = regexec(&rev_regex, line.c_str(), 2, match, 0);
+            ret_status = regexec(&rev_regex, line.c_str(), 2, match.data(), 0);
             if(ret_status == 0)
             {
                 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -302,8 +302,7 @@
 
 int get_max_cpus()
 {
-    int max_cpus = 1;
-#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+    int           max_cpus = 1;
     std::ifstream CPUspresent;
     CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in);
     bool success = false;
@@ -341,7 +340,6 @@
     {
         max_cpus = std::thread::hardware_concurrency();
     }
-#endif /* BARE_METAL */
     return max_cpus;
 }
 #endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
@@ -427,8 +425,8 @@
         std::string line;
         while(bool(getline(cpuinfo, line)))
         {
-            regmatch_t match[2];
-            ret_status = regexec(&cpu_part_rgx, line.c_str(), 2, match, 0);
+            std::array<regmatch_t, 2> match;
+            ret_status = regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0);
             if(ret_status == 0)
             {
                 std::string cpu_part = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
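The fixed-size regmatch_t buffers in CPUUtils.cpp are now std::array objects passed to regexec() via .data(). A self-contained sketch of that POSIX pattern; the regular expression below is illustrative, not the one used in the file:

    #include <array>
    #include <cstdio>
    #include <regex.h>
    #include <string>

    int main()
    {
        regex_t cpu_part_rgx;
        if(regcomp(&cpu_part_rgx, "CPU part[[:space:]]*:[[:space:]]*(0x[0-9a-f]+)", REG_EXTENDED) != 0)
        {
            return 1;
        }

        const std::string line = "CPU part        : 0xd40";
        std::array<regmatch_t, 2> match;
        if(regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0) == 0)
        {
            const std::string cpu_part = line.substr(match[1].rm_so, match[1].rm_eo - match[1].rm_so);
            std::printf("%s\n", cpu_part.c_str()); // 0xd40
        }
        regfree(&cpu_part_rgx);
        return 0;
    }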
diff --git a/src/runtime/Distribution1D.cpp b/src/runtime/Distribution1D.cpp
index 3431834..9e6fce4 100644
--- a/src/runtime/Distribution1D.cpp
+++ b/src/runtime/Distribution1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,12 +31,11 @@
 using namespace arm_compute;
 
 Distribution1D::Distribution1D(size_t num_bins, int32_t offset, uint32_t range)
-    : IDistribution1D(num_bins, offset, range), _data(arm_compute::support::cpp14::make_unique<uint32_t[]>(num_bins))
+    : IDistribution1D(num_bins, offset, range), _data(num_bins)
 {
 }
 
 uint32_t *Distribution1D::buffer() const
 {
-    ARM_COMPUTE_ERROR_ON(nullptr == _data);
-    return _data.get();
+    return _data.data();
 }
diff --git a/src/runtime/GLES_COMPUTE/GCMemory.cpp b/src/runtime/GLES_COMPUTE/GCMemory.cpp
index fed4a15..f1457c4 100644
--- a/src/runtime/GLES_COMPUTE/GCMemory.cpp
+++ b/src/runtime/GLES_COMPUTE/GCMemory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,8 +33,8 @@
 {
 }
 
-GCMemory::GCMemory(std::shared_ptr<IGCMemoryRegion> memory)
-    : _region(nullptr), _region_owned(std::move(memory))
+GCMemory::GCMemory(const std::shared_ptr<IGCMemoryRegion> &memory)
+    : _region(nullptr), _region_owned(memory)
 {
     _region_owned = memory;
     _region       = _region_owned.get();
diff --git a/src/runtime/GLES_COMPUTE/GCScheduler.cpp b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
index f781273..6a39e7c 100644
--- a/src/runtime/GLES_COMPUTE/GCScheduler.cpp
+++ b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -97,7 +97,7 @@
     ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_surfaceless_context") == nullptr), "Failed to query EGL_KHR_surfaceless_context");
     ARM_COMPUTE_UNUSED(egl_extension_st);
 
-    const EGLint config_attribs[] =
+    const std::array<EGLint, 3> config_attribs =
     {
         EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR,
         EGL_NONE
@@ -105,7 +105,7 @@
     EGLConfig cfg;
     EGLint    count;
 
-    res = eglChooseConfig(_display, config_attribs, &cfg, 1, &count);
+    res = eglChooseConfig(_display, config_attribs.data(), &cfg, 1, &count);
 
     ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to choose config: 0x%x.", eglGetError());
     ARM_COMPUTE_UNUSED(res);
@@ -114,7 +114,7 @@
 
     ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to bind api: 0x%x.", eglGetError());
 
-    const EGLint attribs[] =
+    const std::array<EGLint, 3> attribs =
     {
         EGL_CONTEXT_CLIENT_VERSION, 3,
         EGL_NONE
@@ -122,7 +122,7 @@
     _context = eglCreateContext(_display,
                                 cfg,
                                 EGL_NO_CONTEXT,
-                                attribs);
+                                attribs.data());
 
     ARM_COMPUTE_ERROR_ON_MSG(_context == EGL_NO_CONTEXT, "Failed to create context: 0x%x.", eglGetError());
     ARM_COMPUTE_UNUSED(res);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
new file mode 100644
index 0000000..506f648
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+GCConcatenateLayer::GCConcatenateLayer()
+    : _concat_kernels(),
+      _num_inputs(0),
+      _axis(Window::DimZ)
+{
+}
+
+void GCConcatenateLayer::configure(std::vector<IGCTensor *> inputs_vector, IGCTensor *output, size_t axis)
+{
+    ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+    _num_inputs = inputs_vector.size();
+    _axis       = axis;
+
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+
+    unsigned int offset = 0;
+    switch(axis)
+    {
+        case Window::DimZ:
+        {
+            for(unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                auto kernel = support::cpp14::make_unique<GCDepthConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                offset += inputs_vector.at(i)->info()->dimension(axis);
+                _concat_kernels.emplace_back(std::move(kernel));
+            }
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Axis not supported");
+    }
+}
+
+void GCConcatenateLayer::run()
+{
+    for(auto &kernel : _concat_kernels)
+    {
+        GCScheduler::get().dispatch(*kernel, true);
+    }
+}
+} // namespace arm_compute
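GCConcatenateLayer::configure above places each input at a running offset along the concatenation axis. A quick illustration of how that offset accumulates, with plain integers standing in for the per-input depths:

    #include <cstdio>
    #include <vector>

    int main()
    {
        const std::vector<unsigned int> input_depths{ 16, 32, 8 }; // dimension(DimZ) of each input

        unsigned int offset = 0;
        for(auto depth : input_depths)
        {
            std::printf("input written at depth offset %u\n", offset); // 0, 16, 48
            offset += depth;
        }
        std::printf("output depth = %u\n", offset); // 56
        return 0;
    }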
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index a35a18a..61c0740 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -201,7 +201,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run im2col
     GCScheduler::get().dispatch(_fill_border);
@@ -216,8 +216,6 @@
     GCScheduler::get().dispatch(_output_col2im_kernel, false);
     GCScheduler::get().memory_barrier();
 
-    _memory_group.release();
-
     // Run Activation Layer
     if(_is_activationlayer_enabled)
     {
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
index aa937a6..b89aafa 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
@@ -47,13 +47,18 @@
 
     unsigned int depth_offset = 0;
 
-    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<GCDepthConcatenateLayerKernel[]>(_num_inputs);
-    _border_handlers_vector = arm_compute::support::cpp14::make_unique<GCFillBorderKernel[]>(_num_inputs);
+    _concat_kernels_vector.reserve(_num_inputs);
+    _border_handlers_vector.reserve(_num_inputs);
 
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
-        _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
-        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
+        auto concat_kernel = support::cpp14::make_unique<GCDepthConcatenateLayerKernel>();
+        auto border_kernel = support::cpp14::make_unique<GCFillBorderKernel>();
+
+        concat_kernel->configure(inputs_vector.at(i), depth_offset, output);
+        border_kernel->configure(inputs_vector.at(i), concat_kernel->border_size(), BorderMode::CONSTANT, PixelValue());
+        _concat_kernels_vector.emplace_back(std::move(concat_kernel));
+        _border_handlers_vector.emplace_back(std::move(border_kernel));
 
         depth_offset += inputs_vector.at(i)->info()->dimension(2);
     }
@@ -63,8 +68,8 @@
 {
     for(unsigned i = 0; i < _num_inputs; i++)
     {
-        GCScheduler::get().dispatch(_border_handlers_vector[i], false);
+        GCScheduler::get().dispatch(*_border_handlers_vector[i].get(), false);
         GCScheduler::get().memory_barrier();
-        GCScheduler::get().dispatch(_concat_kernels_vector[i], true);
+        GCScheduler::get().dispatch(*_concat_kernels_vector[i].get(), true);
     }
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index ba05838..0f772bd 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -36,8 +36,10 @@
 }
 
 void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
-                                               unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+                                               unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
+    ARM_COMPUTE_ERROR_ON(dilation.x() != 1 || dilation.y() != 1);
+    ARM_COMPUTE_UNUSED(dilation);
     auto k = arm_compute::support::cpp14::make_unique<GCDepthwiseConvolutionLayer3x3Kernel>();
     k->configure(input, weights, biases, output, conv_info, depth_multiplier);
     _kernel = std::move(k);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
index 6b8e341..a208545 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -150,7 +150,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Linearize input if it comes from a convolutional layer
     if(_is_fc_after_conv)
@@ -173,8 +173,6 @@
 
         GCScheduler::get().dispatch(_accumulate_biases_kernel);
     }
-
-    _memory_group.release();
 }
 
 void GCFullyConnectedLayer::prepare()
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
index 8ae91ee..ddfe590 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -162,7 +162,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_is_interleaved_transposed)
     {
@@ -187,8 +187,6 @@
         GCScheduler::get().memory_barrier();
         GCScheduler::get().dispatch(_ma_kernel);
     }
-
-    _memory_group.release();
 }
 
 void GCGEMM::prepare()
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
index 2569365..8f60279 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
@@ -56,13 +56,11 @@
 
 void GCNormalizationLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     GCScheduler::get().dispatch(_multiply_kernel, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_border_handler, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_norm_kernel, true);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
index dad42cd..0645ae7 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -69,13 +69,11 @@
 
 void GCSoftmaxLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     GCScheduler::get().dispatch(_max_kernel, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_shift_exp_sum_kernel, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_norm_kernel);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/HOG.cpp b/src/runtime/HOG.cpp
index 01640bb..e9f38c4 100644
--- a/src/runtime/HOG.cpp
+++ b/src/runtime/HOG.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,20 +29,19 @@
 using namespace arm_compute;
 
 HOG::HOG()
-    : IHOG(), _info(), _descriptor(nullptr)
+    : IHOG(), _info(), _descriptor()
 {
 }
 
 void HOG::init(const HOGInfo &input)
 {
-    ARM_COMPUTE_ERROR_ON(nullptr != _descriptor);
-    _info       = input;
-    _descriptor = arm_compute::support::cpp14::make_unique<float[]>(_info.descriptor_size());
+    _info = input;
+    _descriptor.resize(_info.descriptor_size());
 }
 
 float *HOG::descriptor() const
 {
-    return _descriptor.get();
+    return _descriptor.data();
 }
 
 const HOGInfo *HOG::info() const
diff --git a/src/runtime/LutAllocator.cpp b/src/runtime/LutAllocator.cpp
index eb9051c..0db5217 100644
--- a/src/runtime/LutAllocator.cpp
+++ b/src/runtime/LutAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,23 +28,23 @@
 using namespace arm_compute;
 
 LutAllocator::LutAllocator()
-    : _buffer(nullptr)
+    : _buffer()
 {
 }
 
 uint8_t *LutAllocator::data() const
 {
-    return _buffer.get();
+    return _buffer.data();
 }
 
 void LutAllocator::allocate()
 {
-    _buffer = arm_compute::support::cpp14::make_unique<uint8_t[]>(size());
+    _buffer.resize(size());
 }
 
 uint8_t *LutAllocator::lock()
 {
-    return _buffer.get();
+    return _buffer.data();
 }
 
 void LutAllocator::unlock()
diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp
index d116624..c6b956d 100644
--- a/src/runtime/Memory.cpp
+++ b/src/runtime/Memory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,8 +32,8 @@
 {
 }
 
-Memory::Memory(std::shared_ptr<IMemoryRegion> memory)
-    : _region(nullptr), _region_owned(std::move(memory))
+Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory)
+    : _region(nullptr), _region_owned(memory)
 {
     _region_owned = memory;
     _region       = _region_owned.get();
diff --git a/src/runtime/MultiHOG.cpp b/src/runtime/MultiHOG.cpp
index e0b60b1..154bbd7 100644
--- a/src/runtime/MultiHOG.cpp
+++ b/src/runtime/MultiHOG.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,8 +30,9 @@
 using namespace arm_compute;
 
 MultiHOG::MultiHOG(size_t num_models)
-    : _num_models(num_models), _model(arm_compute::support::cpp14::make_unique<HOG[]>(_num_models))
+    : _num_models(num_models), _model()
 {
+    _model.resize(_num_models);
 }
 
 size_t MultiHOG::num_models() const
@@ -42,11 +43,11 @@
 IHOG *MultiHOG::model(size_t index)
 {
     ARM_COMPUTE_ERROR_ON(index >= _num_models);
-    return (_model.get() + index);
+    return (&_model[index]);
 }
 
 const IHOG *MultiHOG::model(size_t index) const
 {
     ARM_COMPUTE_ERROR_ON(index >= _num_models);
-    return (_model.get() + index);
+    return (&_model[index]);
 }
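
HOG, LutAllocator and MultiHOG above all trade an owning std::unique_ptr<T[]> for a std::vector<T>, so sizing becomes resize() and raw access becomes data(). A small sketch of the same pattern, assuming nothing beyond the standard library:

#include <cstddef>
#include <cstdint>
#include <vector>

// Minimal sketch: a std::vector replaces std::unique_ptr<uint8_t[]> as the owning buffer.
class Buffer
{
public:
    void allocate(std::size_t bytes)
    {
        _data.resize(bytes); // replaces make_unique<uint8_t[]>(bytes)
    }

    uint8_t *data()
    {
        return _data.data(); // replaces _data.get()
    }

private:
    std::vector<uint8_t> _data; // the size is tracked by the vector, no separate length member needed
};
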
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index d33e134..6863bb0 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -57,15 +57,13 @@
 
 void NEArgMinMaxLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_run_fill_border)
     {
         NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
     }
     NEScheduler::get().schedule(&_reduction_kernel, Window::DimY);
-
-    _memory_group.release();
 }
 
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
new file mode 100644
index 0000000..a4db1fd
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_shape, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEBatchToSpaceLayerKernel>();
+    k->configure(input, block_shape, output);
+    _kernel = std::move(k);
+}
+
+void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEBatchToSpaceLayerKernel>();
+    k->configure(input, block_shape_x, block_shape_y, output);
+    _kernel = std::move(k);
+}
+
+Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+{
+    return NEBatchToSpaceLayerKernel::validate(input, block_shape, output);
+}
+
+Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output)
+{
+    return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output);
+}
+} // namespace arm_compute
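
A possible usage sketch for the new NEBatchToSpaceLayer function, using the static block-shape overload shown above. The tensor shape and the assumption that the kernel auto-initialises the output info are illustrative, not facts stated in this diff:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor input;
    Tensor output;

    // Illustrative shape: 2x2 spatial, 1 channel, 4 batches folded back into the spatial dimensions.
    input.allocator()->init(TensorInfo(TensorShape(2U, 2U, 1U, 4U), 1, DataType::F32));

    NEBatchToSpaceLayer b2s;
    b2s.configure(&input, 2, 2, &output); // static block shape: block_shape_x = block_shape_y = 2

    input.allocator()->allocate();
    output.allocator()->allocate(); // assumes the output info was auto-initialised during configure()

    b2s.run();
    return 0;
}
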
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 0e5d50f..032e617 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -162,7 +162,7 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run sobelNxN
     _sobel->run();
@@ -184,6 +184,4 @@
 
     // Run edge tracing
     NEScheduler::get().schedule(&_edge_trace, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index 21ab47d..71af560 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,9 @@
 #include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h"
 
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -35,56 +38,111 @@
 namespace arm_compute
 {
 NEConcatenateLayer::NEConcatenateLayer()
-    : _concat_function(nullptr)
+    : _concat_kernels(),
+      _num_inputs(0),
+      _axis(Window::DimX)
 {
 }
 
-void NEConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output, DataLayoutDimension axis)
+void NEConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output, size_t axis)
 {
     ARM_COMPUTE_ERROR_ON(output == nullptr);
+    _axis       = axis;
+    _num_inputs = inputs_vector.size();
 
-    switch(get_data_layout_dimension_index(output->info()->data_layout(), axis))
+    std::vector<ITensorInfo *> inputs_vector_info;
+    inputs_vector_info.reserve(_num_inputs);
+    for(unsigned int i = 0; i < _num_inputs; ++i)
     {
-        case 0:
+        ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
+        inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+    }
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis);
+
+    // Output auto initialization if not yet initialized
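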
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+    ARM_COMPUTE_ERROR_THROW_ON(NEConcatenateLayer::validate(inputs_vector_info, output->info(), axis));
+
+    unsigned int offset = 0;
+
+    for(unsigned int i = 0; i < _num_inputs; ++i)
+    {
+        switch(_axis)
         {
-            auto func = support::cpp14::make_unique<NEWidthConcatenateLayer>();
-            func->configure(inputs_vector, output);
-            _concat_function = std::move(func);
-            break;
+            case Window::DimX:
+            {
+                auto kernel = support::cpp14::make_unique<NEWidthConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                _concat_kernels.emplace_back(std::move(kernel));
+                break;
+            }
+            case Window::DimY:
+            {
+                auto kernel = support::cpp14::make_unique<NEHeightConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                _concat_kernels.emplace_back(std::move(kernel));
+                break;
+            }
+            case Window::DimZ:
+            {
+                auto kernel = support::cpp14::make_unique<NEDepthConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                _concat_kernels.emplace_back(std::move(kernel));
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Axis not supported");
         }
-        case 2:
-        {
-            auto func = support::cpp14::make_unique<NEDepthConcatenateLayer>();
-            func->configure(inputs_vector, output);
-            _concat_function = std::move(func);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Concatenation is supported across width and depth only!");
+        offset += inputs_vector.at(i)->info()->dimension(_axis);
     }
 }
 
-Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, DataLayoutDimension axis)
+Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
 
-    switch(get_data_layout_dimension_index(output->data_layout(), axis))
+    unsigned int offset = 0;
+    for(const auto &input : inputs_vector)
     {
-        case 0:
-            ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector, output));
-            break;
-        case 2:
-            ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayer::validate(inputs_vector, output));
-            break;
-        default:
-            ARM_COMPUTE_RETURN_ERROR_MSG("Concatenation is supported across width and depth only!");
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+        switch(axis)
+        {
+            case Window::DimX:
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayerKernel::validate(input, offset, output));
+                break;
+            }
+            case Window::DimY:
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(NEHeightConcatenateLayerKernel::validate(input, offset, output));
+                break;
+            }
+            case Window::DimZ:
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayerKernel::validate(input, offset, output));
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Axis not supported");
+        }
+        offset += input->dimension(axis);
     }
+
+    if(output->total_size() != 0)
+    {
+        TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis);
+        ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+    }
+
     return Status{};
 }
 
 void NEConcatenateLayer::run()
 {
-    ARM_COMPUTE_ERROR_ON(_concat_function == nullptr);
-    _concat_function->run();
+    for(auto &kernel : _concat_kernels)
+    {
+        NEScheduler::get().schedule(kernel.get(), _axis);
+    }
 }
 } // namespace arm_compute
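
With the change above, NEConcatenateLayer is configured with a plain size_t axis (Window::DimX/DimY/DimZ) and schedules one kernel per input, auto-initialising the output info in configure(). A usage sketch with illustrative shapes:

#include <vector>

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor a;
    Tensor b;
    Tensor out;
    a.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(8U, 8U, 5U), 1, DataType::F32));

    std::vector<ITensor *> inputs = { &a, &b };

    NEConcatenateLayer concat;
    concat.configure(inputs, &out, Window::DimZ); // depth concatenation: output depth becomes 3 + 5 = 8

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate(); // output info was auto-initialised by configure()

    concat.run();
    return 0;
}
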
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index b84dfd3..973855e 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -103,12 +103,10 @@
 
     if(_is_separable)
     {
-        _memory_group.acquire();
+        MemoryGroupResourceScope scope_mg(_memory_group);
 
         NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
         NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
-
-        _memory_group.release();
     }
     else
     {
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 5059162..a62459b 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -73,6 +73,13 @@
             _function = std::move(f);
             break;
         }
+        case ConvolutionMethod::FFT:
+        {
+            auto f = arm_compute::support::cpp14::make_unique<NEFFTConvolutionLayer>(_memory_manager);
+            f->configure(input, weights, biases, output, conv_info, act_info);
+            _function = std::move(f);
+            break;
+        }
         default:
             ARM_COMPUTE_ERROR("Not supported.");
             break;
@@ -97,6 +104,10 @@
         case ConvolutionMethod::DIRECT:
             // Validate direct convolution layer
             ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
+        case ConvolutionMethod::FFT:
+            // Validate FFT-based convolution layer
+            ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
+            break;
         default:
             ARM_COMPUTE_ERROR("Not supported.");
             break;
@@ -148,12 +159,22 @@
         return (*found).second;
     }
 
-    if(dilation != Size2D(1U, 1U) || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
+    if(dilation != Size2D(1U, 1U))
     {
         return ConvolutionMethod::GEMM;
     }
-
-    return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+    else
+    {
+        if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
+        {
+            return ConvolutionMethod::FFT;
+        }
+        if(input->dimension(idx_c) < 16)
+        {
+            return ConvolutionMethod::GEMM;
+        }
+        return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+    }
 }
 
 void NEConvolutionLayer::run()
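
Read as a decision list, the updated get_convolution_method() heuristic in the hunk above roughly reduces to the following; this is a paraphrase for illustration, with the ITensorInfo queries flattened into plain parameters:

// Paraphrase of the selection logic above, not the exact library code.
enum class ConvMethod
{
    GEMM,
    WINOGRAD,
    FFT
};

ConvMethod choose_method(unsigned int kernel_h, unsigned int in_channels, unsigned int out_channels,
                         bool has_dilation, bool fft_supported, bool winograd_supported)
{
    if(has_dilation)
    {
        return ConvMethod::GEMM; // any dilation other than 1x1 falls back to GEMM
    }
    if(kernel_h > 7 && in_channels > out_channels && fft_supported)
    {
        return ConvMethod::FFT; // large kernels with a shrinking channel count prefer FFT
    }
    if(in_channels < 16)
    {
        return ConvMethod::GEMM; // few input channels: keep GEMM
    }
    return winograd_supported ? ConvMethod::WINOGRAD : ConvMethod::GEMM;
}
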
diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp
new file mode 100644
index 0000000..cc39d02
--- /dev/null
+++ b/src/runtime/NEON/functions/NECropResize.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "arm_compute/runtime/NEON/functions/NECropResize.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+NECropResize::NECropResize()
+    : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results()
+{
+}
+
+Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output,
+                              Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
+    TensorInfo temp_info;
+    ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value));
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        TensorShape out_shape(input->tensor_shape()[0], crop_size.x, crop_size.y, boxes->tensor_shape()[1]);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), out_shape);
+    }
+    return Status{};
+}
+
+void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size,
+                             InterpolationPolicy method, float extrapolation_value)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+
+    _num_boxes = boxes->info()->tensor_shape()[1];
+    TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y);
+
+    _output              = output;
+    _method              = method;
+    _extrapolation_value = extrapolation_value;
+
+    // For each crop box:
+    // - A crop kernel is used to extract the initial cropped image as specified by boxes[i] from the 3D image input[box_ind[i]].
+    // - A tensor is required to hold this initial cropped image.
+    // - A scale function is used to resize the cropped image to the size specified by crop_size.
+    // - A tensor is required to hold the final scaled image before it is copied into the 4D output
+    //   that will hold all final cropped and scaled 3D images.
+    _crop.reserve(_num_boxes);
+    _crop_results.reserve(_num_boxes);
+    _scaled_results.reserve(_num_boxes);
+    _scale.reserve(_num_boxes);
+
+    for(unsigned int i = 0; i < _num_boxes; ++i)
+    {
+        auto       crop_tensor = support::cpp14::make_unique<Tensor>();
+        TensorInfo crop_result_info(1, DataType::F32);
+        crop_result_info.set_data_layout(DataLayout::NHWC);
+        crop_tensor->allocator()->init(crop_result_info);
+
+        auto       scale_tensor = support::cpp14::make_unique<Tensor>();
+        TensorInfo scaled_result_info(out_shape, 1, DataType::F32);
+        scaled_result_info.set_data_layout(DataLayout::NHWC);
+        scale_tensor->allocator()->init(scaled_result_info);
+
+        auto crop_kernel  = support::cpp14::make_unique<NECropKernel>();
+        auto scale_kernel = support::cpp14::make_unique<NEScale>();
+        crop_kernel->configure(input, boxes, box_ind, crop_tensor.get(), i, _extrapolation_value);
+
+        _crop.emplace_back(std::move(crop_kernel));
+        _scaled_results.emplace_back(std::move(scale_tensor));
+        _crop_results.emplace_back(std::move(crop_tensor));
+        _scale.emplace_back(std::move(scale_kernel));
+    }
+}
+
+void NECropResize::run()
+{
+    ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
+
+    for(unsigned int i = 0; i < _num_boxes; ++i)
+    {
+        // Size of the crop box in _boxes and thus the shape of _crop_results[i]
+        // may not be known until run-time and so the kernels cannot be configured until then.
+        _crop[i]->configure_output_shape();
+        _crop_results[i]->allocator()->allocate();
+        NEScheduler::get().schedule(_crop[i].get(), Window::DimZ);
+
+        // Scale the cropped image.
+        _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false);
+        _scaled_results[i]->allocator()->allocate();
+        _scale[i]->run();
+
+        // Copy scaled image into output.
+        std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i)));
+    }
+}
+} // namespace arm_compute
\ No newline at end of file
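
A hedged usage sketch for the new NECropResize function. The tensor layouts below (boxes as (4, num_boxes), box_ind as (num_boxes), both F32, NHWC image input, caller-initialised output) are assumptions inferred from the validate() logic above rather than facts stated in this diff:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NECropResize.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor image;
    Tensor boxes;
    Tensor box_ind;
    Tensor output;

    // Assumed layout: NHWC image of shape (C, W, H, N); 5 crop boxes; one batch index per box.
    TensorInfo image_info(TensorShape(3U, 64U, 64U, 2U), 1, DataType::F32);
    image_info.set_data_layout(DataLayout::NHWC);
    image.allocator()->init(image_info);
    boxes.allocator()->init(TensorInfo(TensorShape(4U, 5U), 1, DataType::F32));
    box_ind.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::F32)); // data type assumed

    // Output initialised by the caller to (C, crop_w, crop_h, num_boxes), as validate() above expects.
    TensorInfo output_info(TensorShape(3U, 14U, 14U, 5U), 1, DataType::F32);
    output_info.set_data_layout(DataLayout::NHWC);
    output.allocator()->init(output_info);

    NECropResize crop_resize;
    crop_resize.configure(&image, &boxes, &box_ind, &output,
                          Coordinates2D{ 14, 14 },             // crop_size
                          InterpolationPolicy::BILINEAR, 0.f); // extrapolation value for out-of-image reads

    image.allocator()->allocate();
    boxes.allocator()->allocate();
    box_ind.allocator()->allocate();
    output.allocator()->allocate();

    // Fill image, boxes (assumed normalised [y0, x0, y1, x1] per box) and box_ind here, then:
    crop_resize.run();
    return 0;
}
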
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 44d7197..aff335e 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -51,8 +51,8 @@
                                       unsigned int inner_border_right, unsigned int inner_border_top)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32, DataType::QASYMM8);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
@@ -68,7 +68,11 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
-    if(bias != nullptr)
+    if(is_data_type_quantized_asymmetric(input->data_type()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+    }
+    else
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
     }
@@ -111,10 +115,11 @@
     _inner_border     = std::make_pair(inner_border_right, inner_border_top);
     _is_prepared      = false;
 
-    const unsigned int stride_x = info.stride().first;
-    const unsigned int stride_y = info.stride().second;
+    const DataLayout   data_layout = input->info()->data_layout();
+    const unsigned int stride_x    = info.stride().first;
+    const unsigned int stride_y    = info.stride().second;
 
-    _weights_flipped.allocator()->init(TensorInfo(weights->info()->tensor_shape(), 1, weights->info()->data_type()));
+    _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
     _flip_weights.configure(weights, &_weights_flipped);
 
     auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
@@ -159,12 +164,10 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     _upsample_f.run();
     _conv_f.run();
-
-    _memory_group.release();
 }
 
 void NEDeconvolutionLayer::prepare()
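
The deconvolution validate() above now admits QASYMM8 input and weights with S32 biases. A brief sketch of building matching tensor descriptors; the quantization parameters and shapes are placeholders, and the commented-out validate() call assumes the output shape already matches the deconvolved dimensions:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

// Placeholder quantization parameters; real values come from the trained model.
const QuantizationInfo qinfo(0.05f, 128);

// QASYMM8 input and weights with an S32 bias, as the updated checks above allow.
const TensorInfo input(TensorShape(8U, 8U, 16U), 1, DataType::QASYMM8, qinfo);
const TensorInfo weights(TensorShape(3U, 3U, 16U, 16U), 1, DataType::QASYMM8, qinfo);
const TensorInfo bias(TensorShape(16U), 1, DataType::S32); // quantized path requires S32 biases
const TensorInfo output(TensorShape(15U, 15U, 16U), 1, DataType::QASYMM8, qinfo);

// Assuming the output shape matches the deconvolved dimensions for this stride/padding:
// Status s = NEDeconvolutionLayer::validate(&input, &weights, &bias, &output,
//                                           PadStrideInfo(2, 2, 1, 1), 0, 0);
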
diff --git a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
index 49db855..8f070a2 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,26 +45,30 @@
 
 void NEDepthConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output) // NOLINT
 {
-    _num_inputs             = inputs_vector.size();
-    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<NEDepthConcatenateLayerKernel[]>(_num_inputs);
-    _border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
+    _num_inputs = inputs_vector.size();
 
     std::vector<ITensorInfo *> inputs_vector_info;
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
     }
-    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector_info);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector_info, Window::DimZ);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
     ARM_COMPUTE_ERROR_THROW_ON(NEDepthConcatenateLayer::validate(inputs_vector_info, output->info()));
 
     unsigned int depth_offset = 0;
+    _concat_kernels_vector.reserve(_num_inputs);
+    _border_handlers_vector.reserve(_num_inputs);
     for(unsigned int i = 0; i < _num_inputs; ++i)
     {
-        _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
-        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+        auto concat_kernel = support::cpp14::make_unique<NEDepthConcatenateLayerKernel>();
+        auto border_kernel = support::cpp14::make_unique<NEFillBorderKernel>();
+        concat_kernel->configure(inputs_vector.at(i), depth_offset, output);
+        border_kernel->configure(inputs_vector.at(i), concat_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+        _border_handlers_vector.emplace_back(std::move(border_kernel));
+        _concat_kernels_vector.emplace_back(std::move(concat_kernel));
 
         depth_offset += inputs_vector.at(i)->info()->dimension(2);
     }
@@ -80,7 +84,7 @@
 
     // Output auto initialization if not yet initialized
     TensorInfo  tmp_output_info = *output->clone();
-    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector);
+    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimZ);
     auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
     unsigned int depth_offset = 0;
@@ -98,7 +102,7 @@
 {
     for(unsigned i = 0; i < _num_inputs; ++i)
     {
-        NEScheduler::get().schedule(&_border_handlers_vector[i], Window::DimX);
-        NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimX);
+        NEScheduler::get().schedule(_border_handlers_vector[i].get(), Window::DimX);
+        NEScheduler::get().schedule(_concat_kernels_vector[i].get(), Window::DimX);
     }
 }
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index f0fd4cf..3bb69b1 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -31,112 +31,79 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+
 using namespace arm_compute::misc;
 using namespace arm_compute::misc::shape_calculator;
 
-NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
-    : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _accumulator(), _permuted_input(),
-      _permuted_weights(), _permuted_output(), _has_bias(false), _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true), _permute(false),
-      _is_activationlayer_enabled(false)
+namespace arm_compute
+{
+NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
+      _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
+      _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
-void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                                               unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+void NEDepthwiseConvolutionLayer3x3::configure_generic(ITensor                   *input,
+                                                       const ITensor             *weights,
+                                                       const ITensor             *biases,
+                                                       ITensor                   *output,
+                                                       const PadStrideInfo       &conv_info,
+                                                       unsigned int               depth_multiplier,
+                                                       const ActivationLayerInfo &act_info,
+                                                       const Size2D              &dilation)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_UNUSED(act_info);
 
     PixelValue zero_value(0.f);
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _has_bias     = biases != nullptr;
-    _is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
-                                                                                          conv_info,
-                                                                                          input->info()->data_type(),
-                                                                                          depth_multiplier,
-                                                                                          input->info()->data_layout());
-    _are_weights_reshaped = false;
-    _is_nchw              = input->info()->data_layout() == DataLayout::NCHW;
-    _permute              = _is_optimized == _is_nchw;
-
     // Initialize the intermediate accumulator tensor in case of quantized input
     if(_is_quantized)
     {
         TensorShape accum_shape  = output->info()->tensor_shape();
         DataLayout  accum_layout = output->info()->data_layout();
-        if(!_is_optimized && !_is_nchw)
+        if(!_is_nchw)
         {
             permute(accum_shape, PermutationVector(1U, 2U, 0U));
             accum_layout = DataLayout::NCHW;
         }
 
+        _memory_group.manage(&_accumulator);
         _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
         _accumulator.info()->set_data_layout(accum_layout);
         zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
     }
 
-    if(_is_optimized)
+    if(!_is_nchw)
     {
-        ITensor *optimized_output = (_is_quantized) ? &_accumulator : output;
-        if(_is_nchw)
-        {
-            // Configure the function to transform the input tensor from NCHW -> NHWC
-            _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
-            _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
 
-            // Configure the function to transform the weights tensor from IHW -> HWI
-            _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
-            _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+        // Configure the function to transform the input tensor from NHWC -> NCHW
+        _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
 
-            // Configure optimized depthwise
-            _dwc_kernel.configure(&_permuted_input, &_permuted_weights, &_permuted_output, conv_info, depth_multiplier, DataLayout::NHWC);
+        // Configure the function to transform the weights tensor from HWI -> IHW
+        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
 
-            // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-            _permuted_output.info()->set_data_layout(DataLayout::NHWC);
-            _permute_output.configure(&_permuted_output, optimized_output, PermutationVector(1U, 2U, 0U));
+        // Configure depthwise
+        _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
 
-            // Allocate tensors
-            _permuted_input.allocator()->allocate();
-            _permuted_weights.allocator()->allocate();
-            _permuted_output.allocator()->allocate();
-        }
-        else
-        {
-            _dwc_kernel.configure(input, weights, optimized_output, conv_info, depth_multiplier, DataLayout::NHWC);
-        }
+        // Configure border handler
+        _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+        // Allocate tensors
+        _permuted_input.allocator()->allocate();
     }
     else
     {
-        if(!_is_nchw)
-        {
-            // Configure the function to transform the input tensor from NHWC -> NCHW
-            _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
-            _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+        // Configure depthwise convolution kernel
+        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
 
-            // Configure the function to transform the weights tensor from HWI -> IHW
-            _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
-            _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
-
-            // Configure optimized depthwise
-            _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier);
-
-            // Configure border handler
-            _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-
-            // Allocate tensors
-            _permuted_input.allocator()->allocate();
-            _permuted_weights.allocator()->allocate();
-        }
-        else
-        {
-            // Configure depthwise convolution kernel
-            _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
-
-            // Configure border handler
-            _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-        }
+        // Configure border handler
+        _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
     }
 
     // Configure biases accumulation
@@ -145,37 +112,138 @@
         const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
 
         float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-        int   output_multiplier, output_shift;
+        int   output_multiplier;
+        int   output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _output_stage_kernel.configure(&_accumulator, biases, (_is_nchw || _is_optimized) ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
+        _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
         _accumulator.allocator()->allocate();
     }
     else if(_has_bias)
     {
-        _output_stage_kernel.configure((_is_nchw || _is_optimized) ? output : &_permuted_output, biases);
+        _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
     }
 
-    if(!_is_optimized && !_is_nchw)
+    // Permute output
+    if(!_is_nchw)
     {
         // Configure the function to transform the convoluted output to NHWC
         _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
         _permuted_output.allocator()->allocate();
     }
+}
 
-    //Configure Activation Layer
+void NEDepthwiseConvolutionLayer3x3::configure_optimized(const ITensor             *input,
+                                                         const ITensor             *weights,
+                                                         const ITensor             *biases,
+                                                         ITensor                   *output,
+                                                         const PadStrideInfo       &conv_info,
+                                                         unsigned int               depth_multiplier,
+                                                         const ActivationLayerInfo &act_info)
+{
+    ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
+    const bool          is_relu         = arm_compute::utils::info_helpers::is_relu(act_info);
+    const bool          is_relu6        = arm_compute::utils::info_helpers::is_relu6(act_info);
+    _is_activationlayer_enabled         = act_info.enabled() && !(is_relu || is_relu6);
+    if(!_is_activationlayer_enabled)
+    {
+        act_info_to_use = act_info;
+    }
+
+    if(_is_nchw)
+    {
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
+
+        // Configure the function to transform the input tensor from NCHW -> NHWC
+        _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+        _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+
+        // Configure the function to transform the weights tensor from IHW -> HWI
+        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+
+        // Configure optimized depthwise
+        _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use);
+
+        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+        _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+
+        // Allocate tensors
+        _permuted_input.allocator()->allocate();
+        _permuted_output.allocator()->allocate();
+    }
+    else
+    {
+        _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use);
+    }
+}
+
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor       *input,
+                                               const ITensor *weights,
+                                               const ITensor *biases,
+                                               ITensor *output, const PadStrideInfo &conv_info,
+                                               unsigned int               depth_multiplier,
+                                               const ActivationLayerInfo &act_info,
+                                               const Size2D              &dilation)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+    // idx_w and idx_h only used for validation
+    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_UNUSED(idx_w);
+    ARM_COMPUTE_UNUSED(idx_h);
+
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _has_bias         = biases != nullptr;
+    _is_optimized     = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
+                                                                                       weights->info(),
+                                                                                       conv_info,
+                                                                                       depth_multiplier, dilation);
+    _is_nchw                    = input->info()->data_layout() == DataLayout::NCHW;
+    _permute                    = _is_optimized == _is_nchw;
+    _is_prepared                = false;
     _is_activationlayer_enabled = act_info.enabled();
 
+    // Configure appropriate pipeline
+    if(_is_optimized)
+    {
+        configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+    }
+    else
+    {
+        configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+    }
+
+    // Configure activation
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.configure(output, nullptr, act_info);
     }
 }
 
-Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo         *input,
+                                                const ITensorInfo         *weights,
+                                                const ITensorInfo         *biases,
+                                                const ITensorInfo         *output,
+                                                const PadStrideInfo       &conv_info,
+                                                unsigned int               depth_multiplier,
+                                                const ActivationLayerInfo &act_info,
+                                                const Size2D              &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
+    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
 
     if(biases != nullptr)
     {
@@ -184,14 +252,20 @@
         ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
     }
 
-    const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
-    TensorInfo accumulator  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-
-    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
-
-    if(is_quantized)
+    if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation))
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
+        const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+        TensorInfo accumulator  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
+
+        if(is_quantized)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier));
     }
 
     //Validate Activation Layer
@@ -203,43 +277,14 @@
     return Status{};
 }
 
-void NEDepthwiseConvolutionLayer3x3::run()
+void NEDepthwiseConvolutionLayer3x3::run_generic()
 {
-    if(_is_first_run && _is_optimized)
-    {
-        _is_first_run = false;
-        // Create convolver (deferred)
-        _dwc_kernel.generate_convolver();
-    }
-
-    // Permute weights
-    if(_permute)
-    {
-        if(!_are_weights_reshaped)
-        {
-            _are_weights_reshaped = true;
-            _permute_weights.run();
-        }
-
-        _permute_input.run();
-    }
-
-    // Handle input
-    if(!_is_optimized)
-    {
-        // Fill border
-        NEScheduler::get().schedule(&_border_handler, Window::DimX);
-    }
+    // Fill border
+    NEScheduler::get().schedule(&_border_handler, Window::DimX);
 
     // Execute depthwise convolution
     NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
 
-    // Permute output
-    if(_is_optimized && _is_nchw)
-    {
-        _permute_output.run();
-    }
-
     // Add biases
     if(_has_bias || _is_quantized)
     {
@@ -247,17 +292,71 @@
     }
 
     // Permute output
-    if(!_is_optimized && !_is_nchw)
+    if(!_is_nchw)
     {
         _permute_output.run();
     }
+}
 
+void NEDepthwiseConvolutionLayer3x3::run_optimized()
+{
+    // Run assembly function
+    _dwc_optimized_func.run();
+
+    // Permute output
+    if(_is_nchw)
+    {
+        _permute_output.run();
+    }
+}
+
+void NEDepthwiseConvolutionLayer3x3::run()
+{
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Permute input
+    if(_permute)
+    {
+        _permute_input.run();
+    }
+
+    _is_optimized ? run_optimized() : run_generic();
+
+    // Run activation
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.run();
     }
 }
 
+void NEDepthwiseConvolutionLayer3x3::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Permute weights
+        if(_permute)
+        {
+            _permuted_weights.allocator()->allocate();
+            _permute_weights.run();
+            _original_weights->mark_as_unused();
+        }
+
+        // Prepare optimized function
+        if(_is_optimized)
+        {
+            _dwc_optimized_func.prepare();
+            if(!_permuted_weights.is_used())
+            {
+                _permuted_weights.allocator()->free();
+            }
+        }
+
+        _is_prepared = true;
+    }
+}
+
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _permute_input(),
       _permute_weights(), _permute_output(), _activationlayer_function(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _permuted_input(), _permuted_weights(),
@@ -266,14 +365,21 @@
 }
 
 void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
     ARM_COMPUTE_UNUSED(channel_idx);
-
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_ERROR_ON((input->info()->dimension(channel_idx) * depth_multiplier) != weights->info()->dimension(channel_idx));
+    // idx_w and idx_h only used for validation
+    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_UNUSED(idx_w);
+    ARM_COMPUTE_UNUSED(idx_h);
+
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
 
     _is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
 
@@ -304,7 +410,7 @@
     bool append_bias = (biases != nullptr) && !_is_quantized;
 
     // Calculate output shape
-    TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+    TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -332,7 +438,7 @@
     shape_im2col.set(1, conv_size);
     shape_im2col.set(2, weights_z);
     _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
-    _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+    _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -356,7 +462,8 @@
         const QuantizationInfo output_quant_info = output->info()->quantization_info();
 
         float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-        int   output_multiplier, output_shift;
+        int   output_multiplier;
+        int   output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
         _output_stage_kernel.configure(&_output_reshaped, biases, output_to_use, output_multiplier, output_shift, output_quant_info.offset);
         _output_reshaped.allocator()->allocate();
@@ -399,14 +506,17 @@
 }
 
 Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
 
     const unsigned int width_idx  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) + (weights->dimension(width_idx) - 1) * (dilation.x() - 1) > input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) + (weights->dimension(height_idx) - 1) * (dilation.y() - 1) > input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom());
     // Clone output to use auto init
     auto output_clone = output->clone();
 
@@ -433,7 +543,7 @@
 
     const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
     const bool         append_bias  = (biases != nullptr) && !is_quantized;
-    TensorShape        output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+    TensorShape        output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
     const size_t       weights_w    = weights_to_use->dimension(0);
     const size_t       weights_h    = weights_to_use->dimension(1);
     const size_t       weights_z    = weights_to_use->dimension(2);
@@ -460,7 +570,7 @@
     shape_im2col.set(1, conv_size);
     shape_im2col.set(2, weights_z);
     TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -542,3 +652,4 @@
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
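The dilation checks added to configure() and validate() in this file bound the effective extent of a dilated kernel by the padded input size. As a minimal standalone sketch (an illustrative helper, not an arm_compute API), the extent along one dimension is:

// Illustrative helper (assumption, not library code): effective extent of a dilated
// kernel along one dimension, matching the expression used in the checks above.
unsigned int dilated_extent(unsigned int kernel_dim, unsigned int dilation)
{
    return kernel_dim + (kernel_dim - 1) * (dilation - 1);
}
// Example: a 3-tap kernel with dilation 2 spans dilated_extent(3, 2) == 5 input samples,
// which must not exceed input_dim + pad_left + pad_right.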
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index 0627977..e92b4bf 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,34 +24,20 @@
 
 #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
 
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
+#include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
-NEDequantizationLayer::NEDequantizationLayer()
-    : _dequantize_kernel()
+namespace arm_compute
 {
+void NEDequantizationLayer::configure(const ITensor *input, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEDequantizationLayerKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
 }
 
-Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(input, output, min_max));
-
-    return Status{};
+    return NEDequantizationLayerKernel::validate(input, output);
 }
-
-void NEDequantizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
-
-    // Configure kernel
-    _dequantize_kernel.configure(input, output, min_max);
-}
-
-void NEDequantizationLayer::run()
-{
-    NEScheduler::get().schedule(&_dequantize_kernel, Window::DimY);
-}
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
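The rewritten NEDequantizationLayer follows the single-kernel function pattern: configure() builds one kernel and hands ownership to the base class, whose run() schedules it. A hedged sketch of that assumed shape (the real base is INESimpleFunctionNoBorder; its exact members are not shown in this patch):

#include <memory>

#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

// Assumed shape of the base class: own one kernel and schedule it on run().
class SingleKernelFunctionSketch
{
public:
    void run()
    {
        arm_compute::NEScheduler::get().schedule(_kernel.get(), arm_compute::Window::DimY);
    }

protected:
    std::unique_ptr<arm_compute::INEKernel> _kernel{ nullptr };
};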
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 40e40c8..322bb2c 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -105,7 +105,7 @@
 {
     NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     NEScheduler::get().schedule(&_conv_kernel, _dim_split);
     if(_has_bias)
@@ -117,5 +117,4 @@
     {
         _activationlayer_function.run();
     }
-    _memory_group.release();
 }
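Several run() methods in this patch swap the manual _memory_group.acquire()/release() pair for a MemoryGroupResourceScope guard. Conceptually this is plain RAII; a minimal sketch under the assumption that the guard simply wraps those two calls (the library class may differ in detail):

#include "arm_compute/runtime/MemoryGroup.h"

// Illustrative RAII wrapper (assumed shape): acquire on construction, release on
// destruction, so early returns and exceptions cannot leave the group acquired.
class ScopedMemoryGroupSketch
{
public:
    explicit ScopedMemoryGroupSketch(arm_compute::MemoryGroup &group)
        : _group(group)
    {
        _group.acquire();
    }
    ~ScopedMemoryGroupSketch()
    {
        _group.release();
    }

private:
    arm_compute::MemoryGroup &_group;
};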
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
new file mode 100644
index 0000000..25ba1c8
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false)
+{
+}
+
+void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NEFFT1D::validate(input->info(), output->info(), config));
+
+    // Decompose size to radix factors
+    const auto         supported_radix   = NEFFTRadixStageKernel::supported_radix();
+    const unsigned int N                 = input->info()->tensor_shape()[config.axis];
+    const auto         decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+    ARM_COMPUTE_ERROR_ON(decomposed_vector.empty());
+
+    // Flags
+    _run_scale = config.direction == FFTDirection::Inverse;
+
+    const bool is_c2r = input->info()->num_channels() == 2 && output->info()->num_channels() == 1;
+
+    // Configure digit reverse
+    FFTDigitReverseKernelInfo digit_reverse_config;
+    digit_reverse_config.axis      = config.axis;
+    digit_reverse_config.conjugate = config.direction == FFTDirection::Inverse;
+    TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
+    _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
+    _memory_group.manage(&_digit_reversed_input);
+    _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+
+    // Create and configure FFT kernels
+    unsigned int Nx = 1;
+    _num_ffts       = decomposed_vector.size();
+    _fft_kernels.resize(_num_ffts);
+    _axis = config.axis;
+
+    for(unsigned int i = 0; i < _num_ffts; ++i)
+    {
+        const unsigned int radix_for_stage = decomposed_vector.at(i);
+
+        FFTRadixStageKernelInfo fft_kernel_info;
+        fft_kernel_info.axis           = config.axis;
+        fft_kernel_info.radix          = radix_for_stage;
+        fft_kernel_info.Nx             = Nx;
+        fft_kernel_info.is_first_stage = (i == 0);
+        _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+
+        Nx *= radix_for_stage;
+    }
+
+    // Configure scale kernel
+    if(_run_scale)
+    {
+        FFTScaleKernelInfo scale_config;
+        scale_config.scale     = static_cast<float>(N);
+        scale_config.conjugate = config.direction == FFTDirection::Inverse;
+        is_c2r ? _scale_kernel.configure(&_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config);
+    }
+
+    // Allocate tensors
+    _digit_reversed_input.allocator()->allocate();
+    _digit_reverse_indices.allocator()->allocate();
+
+    // Init digit reverse indices
+    const auto digit_reverse_cpu = arm_compute::helpers::fft::digit_reverse_indices(N, decomposed_vector);
+    std::copy_n(digit_reverse_cpu.data(), N, reinterpret_cast<unsigned int *>(_digit_reverse_indices.buffer()));
+}
+
+Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+
+    // Check if FFT is decomposable
+    const auto         supported_radix   = NEFFTRadixStageKernel::supported_radix();
+    const unsigned int N                 = input->tensor_shape()[config.axis];
+    const auto         decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+    ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        // All combinations are supported except real input with real output (i.e., both input and output have a single channel)
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() > 2);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+void NEFFT1D::run()
+{
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    NEScheduler::get().schedule(&_digit_reverse_kernel, (_axis == 0 ? Window::DimY : Window::DimZ));
+
+    for(unsigned int i = 0; i < _num_ffts; ++i)
+    {
+        NEScheduler::get().schedule(&_fft_kernels[i], (_axis == 0 ? Window::DimY : Window::DimX));
+    }
+
+    // Run output scaling
+    if(_run_scale)
+    {
+        NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
+    }
+}
+} // namespace arm_compute
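NEFFT1D first factors the transform length into the radices its stage kernels support, then runs one radix-stage kernel per factor while accumulating Nx. The decomposition reduces to greedy factoring; a standalone sketch of the same idea (not the library's helpers::fft::decompose_stages):

#include <set>
#include <vector>

// Greedily factor N into supported radices, largest first; an empty result means N
// cannot be decomposed and the length has to be padded (see pad_decomposable later
// in this patch). Greedy factoring is sufficient when the supported set includes the
// prime radices themselves (e.g. 2, 3, 5, 7).
std::vector<unsigned int> decompose_sketch(unsigned int N, const std::set<unsigned int> &supported_radix)
{
    if(N == 0)
    {
        return {};
    }

    std::vector<unsigned int> stages;
    unsigned int              residual = N;

    for(auto it = supported_radix.rbegin(); it != supported_radix.rend(); ++it)
    {
        while(residual % *it == 0)
        {
            stages.push_back(*it);
            residual /= *it;
        }
    }
    return (residual == 1) ? stages : std::vector<unsigned int>{};
}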
diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp
new file mode 100644
index 0000000..9210ecf
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFFT2D.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFFT2D.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Scheduler.h"
+
+namespace arm_compute
+{
+NEFFT2D::NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+{
+}
+
+void NEFFT2D::configure(const ITensor *input, ITensor *output, const FFT2DInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NEFFT2D::validate(input->info(), output->info(), config));
+
+    // Setup first pass
+    FFT1DInfo first_pass_config;
+    first_pass_config.axis      = config.axes.first;
+    first_pass_config.direction = config.direction;
+    _memory_group.manage(&_first_pass_tensor);
+    _first_pass_func.configure(input, &_first_pass_tensor, first_pass_config);
+
+    // Setup second pass
+    FFT1DInfo second_pass_config;
+    second_pass_config.axis      = config.axes.second;
+    second_pass_config.direction = config.direction;
+    _second_pass_func.configure(&_first_pass_tensor, output, second_pass_config);
+    _first_pass_tensor.allocator()->allocate();
+}
+
+Status NEFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT2DInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    // Create intermediate tensor info
+    TensorInfo first_pass_tensor(input->clone()->set_is_resizable(true).reset_padding().set_num_channels(2));
+
+    // Validate first pass
+    FFT1DInfo first_pass_config;
+    first_pass_config.axis      = config.axes.first;
+    first_pass_config.direction = config.direction;
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(input, &first_pass_tensor, first_pass_config));
+
+    // Validate second pass
+    FFT1DInfo second_pass_config;
+    second_pass_config.axis      = config.axes.second;
+    second_pass_config.direction = config.direction;
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config));
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+void NEFFT2D::run()
+{
+    _memory_group.acquire();
+
+    _first_pass_func.run();
+    _second_pass_func.run();
+
+    _memory_group.release();
+}
+} // namespace arm_compute
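NEFFT2D is two chained 1D passes over the configured axes with a two-channel (interleaved real/imaginary) intermediate tensor. A hedged usage sketch follows; the tensor sizes, names, and the FFT2DInfo defaults (axes {0, 1}, forward direction) are assumptions for illustration only:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEFFT2D.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // 128x128 complex signal, stored as 2 channels of F32 (real, imaginary)
    Tensor src;
    Tensor dst;
    src.allocator()->init(TensorInfo(TensorShape(128U, 128U), 2, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(128U, 128U), 2, DataType::F32));

    NEFFT2D   fft2d(nullptr); // no external memory manager
    FFT2DInfo fft_info;       // assumed defaults: axes = {0, 1}, direction = Forward
    fft2d.configure(&src, &dst, fft_info);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src with interleaved real/imaginary samples ...

    fft2d.run();
    return 0;
}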
diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
new file mode 100644
index 0000000..0823007
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+int pad_decomposable(int N)
+{
+    const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
+
+    int  pad           = 0;
+    bool is_decomposed = false;
+    while(!is_decomposed)
+    {
+        const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
+        is_decomposed                = !decomposed_vector.empty();
+        if(!is_decomposed)
+        {
+            ++pad;
+        }
+    }
+    return pad;
+}
+} // namespace
+
+NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager),
+      _flip_weights_func(),
+      _permute_input_func(),
+      _permute_output_func(),
+      _permute_weights_func(),
+      _permute_bias_func(),
+      _pad_input_func(),
+      _pad_weights_func(),
+      _transform_input_func(memory_manager),
+      _transform_weights_func(),
+      _itransform_output_func(memory_manager),
+      _prod_func(),
+      _reduce_func(),
+      _extract_output_func(),
+      _bias_add_func(),
+      _activation_layer_func(),
+      _permuted_input(),
+      _permuted_weights(),
+      _permuted_bias(),
+      _permuted_output(),
+      _padded_input(),
+      _padded_weights(),
+      _flip_axis(),
+      _flipped_weights(),
+      _transformed_input(),
+      _transformed_weights(),
+      _input_weights_product(),
+      _output_product(),
+      _output_reduced(),
+      _itransformed_output(),
+      _reshaped_output(),
+      _bias_output(),
+      _original_weights(nullptr),
+      _original_bias(nullptr),
+      _is_activationlayer_enabled(false),
+      _needs_permute(false),
+      _has_bias(false),
+      _is_prepared(false)
+{
+}
+
+void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+                                      const ActivationLayerInfo &act_info)
+{
+    _original_weights = weights;
+    _original_bias    = biases;
+
+    // Flag if bias addition is required
+    _has_bias = biases != nullptr;
+
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+    const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+    const Size2D pad_valid   = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+                                      pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+    // Tensors to use
+    ITensor       *input_to_use   = input;
+    const ITensor *weights_to_use = weights;
+    ITensor       *output_to_use  = _has_bias ? &_bias_output : output;
+
+    // Permute bias
+    if(biases != nullptr)
+    {
+        _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
+        _permuted_bias.info()->set_data_layout(DataLayout::NCHW);
+    }
+
+    // Permute input if needed
+    _needs_permute = input->info()->data_layout() == DataLayout::NHWC;
+    if(_needs_permute)
+    {
+        _memory_group.manage(&_permuted_input);
+        // Configure the function to transform the input tensor from NHWC -> NCHW
+        _permute_input_func.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+        // Configure the function to transform the weights tensor from HWI -> IHW
+        _permute_weights_func.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+        input_to_use   = &_permuted_input;
+        weights_to_use = &_permuted_weights;
+    }
+
+    // Flip weights
+    _flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding());
+    _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+    _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis);
+
+    // Pad weights
+    const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+    _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w);
+
+    // Transform weights
+    _transform_weights_func = support::cpp14::make_unique<NEFFT2D>();
+    _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo());
+
+    // Pad input
+    const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+    _memory_group.manage(&_padded_input);
+    _pad_input_func.configure(input_to_use, &_padded_input, padding_in);
+    if(_needs_permute)
+    {
+        _permuted_input.allocator()->allocate();
+    }
+
+    // Transform input
+    _memory_group.manage(&_transformed_input);
+    _transform_input_func.configure(&_padded_input, &_transformed_input, FFT2DInfo());
+    _padded_input.allocator()->allocate();
+
+    // Perform product
+    _memory_group.manage(&_output_product);
+    _prod_func.configure(&_transformed_input, &_transformed_weights, &_output_product);
+    _transformed_input.allocator()->allocate();
+
+    // Perform reduction
+    _memory_group.manage(&_output_reduced);
+    _reduce_func.configure(&_output_product, &_output_reduced, 2, ReductionOperation::SUM);
+    _output_product.allocator()->allocate();
+
+    // Transform output
+    _memory_group.manage(&_itransformed_output);
+    FFT2DInfo itransform_info;
+    itransform_info.direction = FFTDirection::Inverse;
+    _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+    _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itransform_info);
+    _output_reduced.allocator()->allocate();
+
+    // Reshape output
+    TensorShape reshaped_shape = _itransformed_output.info()->tensor_shape();
+    reshaped_shape.remove_dimension(2);
+    _reshaped_output.allocator()->init(_itransformed_output.info()->clone()->set_tensor_shape(reshaped_shape));
+
+    // Extract correct region
+    const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
+    const int start_top  = kernel_size.y() - conv_info.pad_top() - 1;
+    const int end_right  = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+    const int end_bottom = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+    if(_has_bias)
+    {
+        _memory_group.manage(&_bias_output);
+    }
+    else if(_needs_permute)
+    {
+        output_to_use = &_permuted_output;
+        _memory_group.manage(&_permuted_output);
+    }
+    _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_bottom));
+    _reshaped_output.allocator()->allocate();
+    _itransformed_output.allocator()->allocate();
+
+    // Add bias
+    if(biases != nullptr)
+    {
+        output_to_use = output;
+        if(_needs_permute)
+        {
+            output_to_use = &_permuted_output;
+            _memory_group.manage(&_permuted_output);
+        }
+        auto_init_if_empty(*output_to_use->info(), *_bias_output.info());
+        _bias_add_func.configure(&_bias_output, &_permuted_bias, output_to_use, ConvertPolicy::WRAP);
+        _bias_output.allocator()->allocate();
+    }
+
+    // Permute output
+    if(_needs_permute)
+    {
+        // Configure the function to transform the convolved output back to the original NHWC layout
+        _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+        _permute_output_func.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+        // Allocate tensors
+        _permuted_output.allocator()->allocate();
+    }
+
+    // Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activation_layer_func.configure(output, nullptr, act_info);
+    }
+
+    // Setup flip axis data
+    _flip_axis.allocator()->allocate();
+
+    auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+    axis_data[0]   = 0;
+    axis_data[1]   = 1;
+}
+
+Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                       const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+
+    // Strides
+    const auto strides = conv_info.stride();
+    ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+
+    // Validate biases
+    if(biases != nullptr)
+    {
+        const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channels] != biases->tensor_shape().x());
+    }
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+
+        // Validate Activation Layer
+        if(act_info.enabled())
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+        }
+    }
+
+    return Status{};
+}
+
+void NEFFTConvolutionLayer::run()
+{
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Transform input
+    if(_needs_permute)
+    {
+        _permute_input_func.run();
+    }
+    _pad_input_func.run();
+    _transform_input_func.run();
+
+    // Perform element-wise product and reduction in the frequency domain
+    _prod_func.run();
+
+    _reduce_func.run();
+
+    // Transform output
+    _itransform_output_func.run();
+    _reshaped_output.allocator()->import_memory(_itransformed_output.buffer());
+    _extract_output_func.run();
+
+    // Add bias
+    if(_has_bias)
+    {
+        _bias_add_func.run();
+    }
+    if(_needs_permute)
+    {
+        _permute_output_func.run();
+    }
+
+    // Run activation layer
+    if(_is_activationlayer_enabled)
+    {
+        _activation_layer_func.run();
+    }
+}
+
+void NEFFTConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Permute bias to NCHW
+        if(_original_bias != nullptr)
+        {
+            _permuted_bias.allocator()->allocate();
+            _permute_bias_func.run();
+            _original_bias->mark_as_unused();
+        }
+
+        const ITensor *cur_weights = _original_weights;
+
+        // Permute weights
+        if(_needs_permute)
+        {
+            ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
+
+            _permuted_weights.allocator()->allocate();
+            _permute_weights_func.run();
+            cur_weights->mark_as_unused();
+            cur_weights = &_permuted_weights;
+        }
+
+        // Flip weights
+        _flipped_weights.allocator()->allocate();
+        _flip_weights_func.run();
+        cur_weights->mark_as_unused();
+
+        // Pad weights
+        _padded_weights.allocator()->allocate();
+        _pad_weights_func.run();
+        _flipped_weights.mark_as_unused();
+        _flipped_weights.allocator()->free();
+
+        // Transform weights to frequency domain
+        _transformed_weights.allocator()->allocate();
+        _transform_weights_func->run();
+        _transform_weights_func.reset();
+
+        _padded_weights.mark_as_unused();
+        _padded_weights.allocator()->free();
+
+        _is_prepared = true;
+    }
+}
+} // namespace arm_compute
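After the inverse transform, the extract step crops away the circular-convolution border using start = kernel - pad - 1 and a matching offset (plus the decomposition padding) from the padded end. A small hedged numeric check of those expressions; the values below are illustrative assumptions, not library code:

// 3x3 "same" convolution on an 8-wide input, pad_left = pad_right = 1, and no extra
// decomposition padding (pad_valid.x() == 0): input_w + kernel_w - 1 == 10 is already
// decomposable into supported radices.
const int input_w     = 8;
const int kernel_w    = 3;
const int pad_left    = 1;
const int pad_right   = 1;
const int pad_valid_x = 0;

const int reshaped_w  = input_w + kernel_w - 1 + pad_valid_x;                  // 10
const int start_left  = kernel_w - pad_left - 1;                               // 1
const int end_right   = reshaped_w - (kernel_w - pad_right - 1) - pad_valid_x; // 9
// Extracted width: end_right - start_left == 8, i.e. the original input width.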
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index 4137b1d..af35301 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -93,7 +93,7 @@
 {
     NEScheduler::get().schedule(&_border_handler, Window::DimZ);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
 
@@ -103,6 +103,4 @@
     }
 
     NEScheduler::get().schedule(&_fill_kernel, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 45e21b5..e1a17db 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -228,7 +228,8 @@
     if(_is_quantized)
     {
         float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
-        int   output_multiplier, output_shift;
+        int   output_multiplier;
+        int   output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
         _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
         _gemmlowp_output.allocator()->allocate();
@@ -333,7 +334,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Linearize input if it comes from a convolutional layer
     if(_is_fc_after_conv)
@@ -363,8 +364,6 @@
             NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
         }
     }
-
-    _memory_group.release();
 }
 
 void NEFullyConnectedLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 914f088..55bcc45 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -238,16 +238,14 @@
 {
     prepare();
 
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
     if(_asm_glue.is_configured())
     {
-        _memory_group.acquire();
         _asm_glue.run();
-        _memory_group.release();
     }
     else
     {
-        _memory_group.acquire();
-
         if(!_run_vector_matrix_multiplication)
         {
             // Run interleave kernel
@@ -262,8 +260,6 @@
 
         NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
 
-        _memory_group.release();
-
         // Run matrix addition kernel
         if(_run_addition)
         {
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 470e922..55e067f 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -35,7 +35,7 @@
 {
 namespace
 {
-std::unique_ptr<IFunction> create_function_all_types(arm_gemm::KernelDescription gemm_kernel_info,
+std::unique_ptr<IFunction> create_function_all_types(const arm_gemm::KernelDescription &gemm_kernel_info,
                                                      const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
                                                      std::shared_ptr<IMemoryManager> memory_manager)
 
@@ -375,7 +375,7 @@
 
 void NEGEMMAssemblyDispatch::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
     if(_function != nullptr)
     {
         _function->run();
@@ -385,6 +385,5 @@
         ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
         _arm_gemm->run();
     }
-    _memory_group.release();
 }
 } //namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index be7cc2d..a2c4e8a 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -90,16 +90,17 @@
 }
 
 NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
-    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
-      _add_bias_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false),
-      _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
+    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
+      _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false),
+      _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
-void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, ITensor *output, int gemm_3d_depth)
+void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info, int gemm_3d_depth)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info(), gemm_3d_depth, _skip_im2col));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? nullptr : output->info(), act_info, gemm_3d_depth,
+                                           _skip_im2col));
 
     const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
                                          gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
@@ -114,7 +115,41 @@
         input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
         weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
 
-        _mm_gemmlowp.configure(input, weights, nullptr, output, gemm_info);
+        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input_quantization_info : output->info()->quantization_info();
+
+        float multiplier = input_quantization_info.scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+        int   output_multiplier;
+        int   output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+        // Merge activation with output stage
+        int min_activation = 0;
+        int max_activation = 0;
+
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+                                                                                 };
+        if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+        {
+            const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+            const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+
+            _is_activationlayer_enabled = false;
+        }
+
+        GEMMLowpOutputStageInfo output_info;
+        output_info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+        output_info.gemmlowp_offset     = output_quant_info.offset;
+        output_info.gemmlowp_multiplier = output_multiplier;
+        output_info.gemmlowp_shift      = output_shift;
+        output_info.gemmlowp_min_bound  = min_activation;
+        output_info.gemmlowp_max_bound  = max_activation;
+
+        _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info));
 
         // Revert QuantizationInfo as input and weights could be used in other convolution layers
         input->info()->set_quantization_info(input_quantization_info);
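The fused output stage needs the real rescale factor input_scale * weights_scale / output_scale expressed as an integer multiplier plus a right shift. A sketch of the standard gemmlowp-style decomposition for a multiplier in (0, 1) follows; it mirrors what calculate_quantized_multiplier_less_than_one is expected to produce, although the library helper's edge-case handling may differ:

#include <cmath>
#include <cstdint>

// Express m in (0, 1) as quantized_multiplier * 2^-right_shift, with the multiplier
// a Q0.31 fixed-point value (illustrative sketch, not the library implementation).
void quantize_multiplier_sketch(double m, int32_t *quantized_multiplier, int *right_shift)
{
    int          exponent = 0;
    const double q        = std::frexp(m, &exponent); // m == q * 2^exponent, q in [0.5, 1)
    *right_shift          = -exponent;

    int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
    if(q_fixed == (1ll << 31)) // rounding pushed q up to 1.0
    {
        q_fixed /= 2;
        --(*right_shift);
    }
    *quantized_multiplier = static_cast<int32_t>(q_fixed);
}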
@@ -127,9 +162,11 @@
     }
 }
 
-Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int gemm_3d_depth, bool skip_im2col)
+Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act_info,
+                                           int gemm_3d_depth, bool skip_im2col)
 {
-    const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+    const bool is_quantized          = is_data_type_quantized_asymmetric(input->data_type());
+    const bool is_activation_enabled = act_info.enabled();
 
     const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
                                          gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
@@ -145,8 +182,40 @@
         input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
         weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
 
+        const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input_quantization_info : output->quantization_info();
+
+        float multiplier = input_quantization_info.scale * weights->quantization_info().scale / output_quant_info.scale;
+        int   output_multiplier;
+        int   output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+        // Merge activation with output stage
+        int min_activation = 0;
+        int max_activation = 0;
+
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+                                                                                 };
+        if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
+        {
+            const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+            const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+        }
+
+        GEMMLowpOutputStageInfo output_info;
+        output_info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+        output_info.gemmlowp_offset     = output_quant_info.offset;
+        output_info.gemmlowp_multiplier = output_multiplier;
+        output_info.gemmlowp_shift      = output_shift;
+        output_info.gemmlowp_min_bound  = min_activation;
+        output_info.gemmlowp_max_bound  = max_activation;
+
         // Perform validation step on GEMMLowp
-        return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), nullptr, output, gemm_info);
+        return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info));
     }
     else
     {
@@ -155,19 +224,18 @@
     }
 }
 
-Status NEGEMMConvolutionLayer::validate_gemm3d(DataType data_type, int gemm_3d_depth, bool skip_im2col)
+Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
 {
-    const bool         is_quantized          = is_data_type_quantized_asymmetric(data_type);
-    const DataType     output_gemm_data_type = is_quantized ? DataType::S32 : data_type;
-    const unsigned int mult_y                = skip_im2col ? 1U : gemm_3d_depth;
-    const unsigned int mult_z                = skip_im2col ? gemm_3d_depth : 1U;
+    const DataType     data_type = input_info->data_type();
+    const unsigned int mult_y    = skip_im2col ? 1U : gemm_3d_depth;
+    const unsigned int mult_z    = skip_im2col ? gemm_3d_depth : 1U;
 
     // Set dummy tensor shapes for the validation
-    const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type);
+    const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
     const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type);
-    const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, output_gemm_data_type);
+    const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
 
-    return validate_mm(&dummy_input_info, &dummy_weights_info, &dummy_output_info, gemm_3d_depth, skip_im2col);
+    return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, gemm_3d_depth, skip_im2col);
 }
 
 void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
@@ -202,9 +270,8 @@
     _append_bias                = (biases != nullptr) && (!_is_quantized);
     _is_activationlayer_enabled = act_info.enabled();
 
-    const ITensor *gemm_input_to_use         = input;
-    ITensor       *gemm_output_to_use        = output;
-    ITensor       *gemm_output_staged_to_use = output;
+    const ITensor *gemm_input_to_use  = input;
+    ITensor       *gemm_output_to_use = output;
 
     // Get convolved dimensions
     unsigned int conv_w = 0;
@@ -219,7 +286,7 @@
     // Check if GEMM3D is supported
     if(data_layout == DataLayout::NHWC)
     {
-        _skip_col2im = bool(validate_gemm3d(input->info()->data_type(), conv_h, true));
+        _skip_col2im = bool(validate_gemm3d(input->info(), act_info, conv_h, true));
         // If not supported, we need to perform im2col and col2im (or reshape layer)
         if(!_skip_col2im)
         {
@@ -262,26 +329,17 @@
     }
 
     // Create temporary GEMM output tensor in case we cannot skip col2im
-    if(!_skip_col2im || _is_quantized)
+    if(!_skip_col2im)
     {
-        // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
-        const DataType gemm_data_type = _is_quantized ? DataType::S32 : data_type;
-        TensorShape    shape_gemm;
+        TensorShape shape_gemm;
 
-        if(_is_quantized && _skip_col2im)
-        {
-            shape_gemm = output->info()->tensor_shape();
-        }
-        else
-        {
-            // Calculate GEMM output shape
-            shape_gemm = _im2col_output.info()->tensor_shape();
-            shape_gemm.set(0, mat_weights_cols);
-            shape_gemm.set(1, conv_w * conv_h);
-        }
+        // Calculate GEMM output shape
+        shape_gemm = _im2col_output.info()->tensor_shape();
+        shape_gemm.set(0, mat_weights_cols);
+        shape_gemm.set(1, conv_w * conv_h);
 
         // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
-        TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
+        TensorInfo info_gemm(shape_gemm, 1, data_type);
         info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
         _gemm_output.allocator()->init(info_gemm);
         _memory_group.manage(&_gemm_output);
@@ -293,62 +351,24 @@
     // Configure GEMM
     // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
-    configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, gemm_3d_depth);
+    configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, gemm_3d_depth);
 
     if(!_skip_im2col)
     {
         _im2col_output.allocator()->allocate();
     }
 
-    // Configure output stage for quantized case
-    if(_is_quantized)
-    {
-        const QuantizationInfo input_quant_info  = input->info()->quantization_info();
-        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input_quant_info : output->info()->quantization_info();
-
-        float multiplier = input_quant_info.scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-        int   output_multiplier, output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-
-        if(!_skip_col2im)
-        {
-            _memory_group.manage(&_tmp_output);
-            gemm_output_staged_to_use = &_tmp_output;
-        }
-
-        // Merge activation with output stage
-        int min_activation = 0;
-        int max_activation = 0;
-
-        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-                                                                                 };
-        if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
-        {
-            const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
-            const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
-
-            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
-            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
-
-            _is_activationlayer_enabled = false;
-        }
-
-        _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset, min_activation, max_activation);
-    }
-
     if(!_skip_col2im)
     {
         if(_data_layout == DataLayout::NCHW)
         {
             // Configure col2im
-            _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, Size2D(conv_w, conv_h));
+            _col2im_kernel.configure(gemm_output_to_use, output, Size2D(conv_w, conv_h));
         }
         else
         {
             // Configure reshape layer
-            _reshape_layer.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output);
+            _reshape_layer.configure(gemm_output_to_use, output);
         }
     }
 
@@ -394,11 +414,13 @@
     const unsigned int kernel_width  = weights->dimension(idx_width);
     const unsigned int kernel_height = weights->dimension(idx_height);
 
-    TensorInfo         im2col_reshaped_info, info_gemm, tmp_info, weights_reshaped_info;
-    const ITensorInfo *gemm_input_to_use         = input;
-    const ITensorInfo *gemm_output_to_use        = output;
-    const ITensorInfo *gemm_output_staged_to_use = output;
-    const ITensorInfo *weights_to_use            = weights;
+    TensorInfo         im2col_reshaped_info{};
+    TensorInfo         info_gemm{};
+    TensorInfo         tmp_info{};
+    TensorInfo         weights_reshaped_info{};
+    const ITensorInfo *gemm_input_to_use  = input;
+    const ITensorInfo *gemm_output_to_use = output;
+    const ITensorInfo *weights_to_use     = weights;
 
     const bool is_quantized          = is_data_type_quantized_asymmetric(data_type);
     const bool append_bias           = (biases != nullptr) && (!is_quantized);
@@ -420,7 +442,7 @@
     bool skip_col2im = false;
     if(data_layout == DataLayout::NHWC)
     {
-        skip_col2im = bool(validate_gemm3d(input->data_type(), conv_h, true));
+        skip_col2im = bool(validate_gemm3d(input, act_info, conv_h, true));
         // If not supported, we need to perform im2col and col2im (or reshape layer)
         if(!skip_col2im)
         {
@@ -431,7 +453,7 @@
     if(skip_col2im)
     {
         // If not supported, we need to perform im2col and col2im (or reshape layer)
-        if(!bool(validate_gemm3d(input->data_type(), conv_h, skip_im2col)))
+        if(!bool(validate_gemm3d(input, act_info, conv_h, skip_im2col)))
         {
             skip_im2col = false;
             skip_col2im = false;
@@ -495,68 +517,25 @@
     }
 
     // Create temporary GEMM output tensor in case we cannot skip col2im
-    const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type;
     if(!skip_col2im)
     {
         TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
         shape_gemm.set(0, mat_weights_cols);
         shape_gemm.set(1, conv_w * conv_h);
-        info_gemm = TensorInfo(shape_gemm, 1, gemm_data_type);
+        info_gemm = TensorInfo(shape_gemm, 1, data_type);
     }
     else
     {
-        info_gemm = TensorInfo(output->tensor_shape(), 1, gemm_data_type);
+        info_gemm = TensorInfo(output->tensor_shape(), 1, data_type);
     }
     info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout());
     gemm_output_to_use = &info_gemm;
-
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, skip_col2im ? conv_h : 0, skip_im2col));
-
-    if(is_quantized)
-    {
-        const QuantizationInfo input_quant_info  = input->quantization_info();
-        const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input_quant_info : output->quantization_info();
-        const float            multiplier        = input_quant_info.scale * weights_to_use->quantization_info().scale / output_quant_info.scale;
-        int                    output_multiplier, output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-
-        if(!skip_col2im)
-        {
-            tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
-            tmp_info.set_quantization_info(output->quantization_info()).set_data_layout(data_layout);
-            gemm_output_staged_to_use = &tmp_info;
-        }
-
-        // Merge activation with output stage
-        int min_activation = 0;
-        int max_activation = 0;
-
-        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-                                                                                 };
-
-        if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
-        {
-            const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
-            const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
-
-            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
-            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
-
-            is_activation_enabled = false;
-        }
-
-        // Validate output stage for quantized case
-        NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, min_activation, max_activation);
-    }
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, skip_col2im ? conv_h : 0, skip_im2col));
 
     // Validate Col2Im/ReshapeLayer
     if(!skip_col2im && (data_layout == DataLayout::NCHW))
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(is_quantized ? gemm_output_staged_to_use : gemm_output_to_use,
-                                                             output,
-                                                             Size2D(conv_w, conv_h)));
+        ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h)));
     }
 
     //Validate Activation Layer
@@ -572,7 +551,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(!_skip_im2col)
     {
@@ -586,9 +565,6 @@
     {
         // Run gemmlowp
         _mm_gemmlowp.run();
-
-        // Run output stage
-        _gemmlowp_output_stage.run();
     }
     else
     {
@@ -618,8 +594,6 @@
     {
         _activationlayer_function.run();
     }
-
-    _memory_group.release();
 }
 
 void NEGEMMConvolutionLayer::prepare()
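
The run() hunks above (and several below) swap manual _memory_group.acquire()/_memory_group.release() pairs for a MemoryGroupResourceScope object constructed at the top of run(). A minimal sketch of the RAII pattern that change implies, using hypothetical stand-in types (MemoryGroupLike, ScopedMemoryGroup) rather than the library's own classes:

// Hypothetical stand-in; the real memory group lives in the arm_compute runtime headers.
struct MemoryGroupLike
{
    void acquire() { /* back the managed tensors with memory from the pool */ }
    void release() { /* hand the memory back to the pool */ }
};

// RAII scope mirroring what the patch uses in place of paired acquire()/release():
// acquire on construction, release on destruction, so an early return or exception
// can no longer leave the group acquired.
class ScopedMemoryGroup
{
public:
    explicit ScopedMemoryGroup(MemoryGroupLike &group)
        : _group(group)
    {
        _group.acquire();
    }
    ~ScopedMemoryGroup()
    {
        _group.release();
    }
    ScopedMemoryGroup(const ScopedMemoryGroup &) = delete;
    ScopedMemoryGroup &operator=(const ScopedMemoryGroup &) = delete;

private:
    MemoryGroupLike &_group;
};

int main()
{
    MemoryGroupLike memory_group;
    {
        ScopedMemoryGroup scope_mg(memory_group); // acquire()
        /* schedule kernels here */
    } // release() runs here automatically
    return 0;
}
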
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 47c3358..ede89bf 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -1,4 +1,5 @@
-/* Copyright (c) 2017-2018 ARM Limited.
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -116,7 +117,7 @@
 
 void NEGEMMLowpAssemblyMatrixMultiplyCore::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
     if(_mtx_a_reshape_kernel)
     {
         NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
@@ -135,6 +136,4 @@
     {
         NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
     }
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 5286f11..54f49a6 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -42,8 +42,8 @@
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
-      _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false),
-      _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
+      _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _original_b(nullptr), _a_offset(0), _b_offset(0),
+      _run_vector_matrix_multiplication(false), _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false)
 {
 }
 
@@ -53,6 +53,9 @@
     ARM_COMPUTE_UNUSED(c);
     ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
 
+    const ITensor *matrix_a = a;
+    const ITensor *matrix_b = b;
+
     // Clear state
     _mtx_a_reshape_kernel = nullptr;
     _mtx_b_reshape_kernel = nullptr;
@@ -65,6 +68,18 @@
     _is_prepared                      = false;
     _original_b                       = b;
 
+    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+    {
+        _fuse_output_stage = true;
+
+        _memory_group.manage(&_mm_result_s32);
+
+        TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
+
+        _mm_result_s32.allocator()->init(info_mm_result_s32);
+    }
+
 #ifdef __aarch64__
     switch(a->info()->data_type())
     {
@@ -72,7 +87,7 @@
         case DataType::U8:
         case DataType::S8:
         {
-            _asm_glue.configure(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run);
+            _asm_glue.configure(a, b, _fuse_output_stage ? &_mm_result_s32 : output, 1.f, 0.f, _reshape_b_only_on_first_run);
             _dot_product_path = _asm_glue.is_configured();
             break;
         }
@@ -83,51 +98,35 @@
         }
     }
 #endif /* __aarch64__ */
-    if(!_dot_product_path)
+    if(!(_dot_product_path || _run_vector_matrix_multiplication))
     {
-        if(_run_vector_matrix_multiplication)
+        matrix_a = &_tmp_a;
+        matrix_b = &_tmp_b;
+
+        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+        TensorInfo a_info(compute_interleaved_shape(*a->info()), 1, a->info()->data_type(), a->info()->quantization_info());
+        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+        TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
+        _tmp_a.allocator()->init(a_info);
+        _tmp_b.allocator()->init(b_info);
+        _memory_group.manage(&_tmp_a);
+        if(!_reshape_b_only_on_first_run)
         {
-            // Configure matrix multiply kernel
-            {
-                auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
-                k->configure(a, b, output);
-                _mm_kernel = std::move(k);
-            }
+            _memory_group.manage(&_tmp_b);
         }
-        else
+
+        // Configure interleave kernel
         {
-            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-            TensorInfo info_a = a->info()->clone()->set_tensor_shape(compute_interleaved_shape(*a->info())).set_is_resizable(true);
-            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-            TensorInfo info_b = b->info()->clone()->set_tensor_shape(compute_transpose1xW_shape(*b->info())).set_is_resizable(true);
-            _tmp_a.allocator()->init(info_a);
-            _tmp_b.allocator()->init(info_b);
-            _memory_group.manage(&_tmp_a);
-            if(!_reshape_b_only_on_first_run)
-            {
-                _memory_group.manage(&_tmp_b);
-            }
+            auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+            k->configure(a, &_tmp_a);
+            _mtx_a_reshape_kernel = std::move(k);
+        }
 
-            // Configure interleave kernel
-            {
-                auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
-                k->configure(a, &_tmp_a);
-                _mtx_a_reshape_kernel = std::move(k);
-            }
-
-            // Configure transpose kernel
-            {
-                auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
-                k->configure(b, &_tmp_b);
-                _mtx_b_reshape_kernel = std::move(k);
-            }
-
-            // Configure matrix multiply kernel
-            {
-                auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
-                k->configure(&_tmp_a, &_tmp_b, output);
-                _mm_kernel = std::move(k);
-            }
+        // Configure transpose kernel
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+            k->configure(b, &_tmp_b);
+            _mtx_b_reshape_kernel = std::move(k);
         }
     }
 
@@ -158,8 +157,33 @@
         _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);
     }
 
-    // Configure offset contribution kernel
-    _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+    if(_fuse_output_stage)
+    {
+        // Configure matrix multiply kernel
+        if(!_dot_product_path)
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+            k->configure(matrix_a, matrix_b, &_mm_result_s32);
+            _mm_kernel = std::move(k);
+        }
+
+        _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
+                                                           _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());
+
+        _mm_result_s32.allocator()->allocate();
+    }
+    else
+    {
+        // Configure matrix multiply kernel
+        if(!_dot_product_path)
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+            k->configure(matrix_a, matrix_b, output);
+            _mm_kernel = std::move(k);
+        }
+        // Configure offset contribution kernel
+        _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+    }
 
     // Allocate tensors
     if(!_dot_product_path && !_run_vector_matrix_multiplication)
@@ -185,43 +209,53 @@
 Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                     "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
 
+    const ITensorInfo *matrix_a_info = a;
+    const ITensorInfo *matrix_b_info = b;
+
+    TensorInfo tmp_a_info{};
+    TensorInfo tmp_b_info{};
+    TensorInfo mm_result_s32_info{};
+
     int32_t    a_offset                    = a->quantization_info().offset;
     int32_t    b_offset                    = b->quantization_info().offset;
     const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
 
+    bool fuse_output_stage = gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+    if(fuse_output_stage)
+    {
+        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+    }
+
     // Check if we need to run the optimized assembly kernel
-    const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, output, 1.f, 0.f, reshape_b_only_on_first_run));
+    const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f, reshape_b_only_on_first_run));
 
     if(run_optimised)
     {
-        if(output->total_size() != 0)
+        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+        if(gemm_info.depth_output_gemm3d() != 0)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
-            if(gemm_info.depth_output_gemm3d() != 0)
+            if(gemm_info.reinterpret_input_as_3d())
             {
-                if(gemm_info.reinterpret_input_as_3d())
-                {
-                    ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
-                    ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
-                }
-                else
-                {
-                    ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
-                }
+                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
             }
             else
             {
-                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
             }
         }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+        }
     }
     else
     {
@@ -231,6 +265,9 @@
         const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
         if(!run_vector_matrix_multiplication)
         {
+            matrix_a_info = &tmp_a_info;
+            matrix_b_info = &tmp_b_info;
+
             // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
             TensorShape shape_tmp_a = a->tensor_shape();
             shape_tmp_a.set(0, a->dimension(0) * 4);
@@ -241,20 +278,17 @@
             shape_tmp_b.set(0, b->dimension(1) * 16);
             shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
 
-            TensorInfo info_a = a->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true);
-            TensorInfo info_b = b->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true);
+            // Validate interleave kernel
+            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(shape_tmp_a));
+            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
 
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
         }
     }
 
-    TensorInfo info_vector_sum_col, info_vector_sum_row;
+    TensorInfo info_vector_sum_col{};
+    TensorInfo info_vector_sum_row{};
 
     // Validate matrix B reduction kernel only if _a_offset is not equal to 0
     if(a_offset != 0)
@@ -274,12 +308,32 @@
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
     }
 
-    // Validate offset contribution kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
-                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
-                                                                             b_offset == 0 ? nullptr : &info_vector_sum_row,
-                                                                             a_offset, b_offset));
+    if(fuse_output_stage)
+    {
+        if(!run_optimised)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
+        }
 
+        // Validate offset contribution kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
+                                                                                            a_offset == 0 ? nullptr : &info_vector_sum_col,
+                                                                                            b_offset == 0 ? nullptr : &info_vector_sum_row,
+                                                                                            c, output, a_offset, b_offset,
+                                                                                            gemm_info.gemmlowp_output_stage()));
+    }
+    else
+    {
+        if(!run_optimised)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+        }
+        // Validate offset contribution kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
+                                                                                 a_offset == 0 ? nullptr : &info_vector_sum_col,
+                                                                                 b_offset == 0 ? nullptr : &info_vector_sum_row,
+                                                                                 a_offset, b_offset));
+    }
     return Status{};
 }
 
@@ -287,7 +341,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Reshape inputs
     if(_mtx_a_reshape_kernel)
@@ -321,10 +375,16 @@
         NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
     }
 
-    // Run offset contribution kernel
-    NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
-
-    _memory_group.release();
+    if(_fuse_output_stage)
+    {
+        // Run offset contribution kernel
+        NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
+    }
+    else
+    {
+        // Run offset contribution kernel
+        NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+    }
 }
 
 void NEGEMMLowpMatrixMultiplyCore::prepare()
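
The a_offset/b_offset guards above (the B-reduction output vector_sum_col is only needed when a_offset != 0, the A-reduction output vector_sum_row only when b_offset != 0) follow from expanding the quantized product: sum_k (A[i][k] - a_offset)(B[k][j] - b_offset) = sum_k A[i][k]B[k][j] - b_offset*rowsum_A[i] - a_offset*colsum_B[j] + K*a_offset*b_offset. A self-contained check of that identity on toy matrices, written with plain C++ loops rather than the NEON kernels:

#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    const int     M = 2, N = 3, K = 4;
    const int32_t a_offset = 3, b_offset = 7; // toy quantization zero points

    std::vector<uint8_t> A(M * K), B(K * N);
    for(int i = 0; i < M * K; ++i) A[i] = static_cast<uint8_t>(5 + i);
    for(int i = 0; i < K * N; ++i) B[i] = static_cast<uint8_t>(11 + 2 * i);

    // Row sums of A and column sums of B (what the reduction kernels provide).
    std::vector<int32_t> sum_row_a(M, 0), sum_col_b(N, 0);
    for(int i = 0; i < M; ++i)
        for(int k = 0; k < K; ++k) sum_row_a[i] += A[i * K + k];
    for(int k = 0; k < K; ++k)
        for(int j = 0; j < N; ++j) sum_col_b[j] += B[k * N + j];

    for(int i = 0; i < M; ++i)
    {
        for(int j = 0; j < N; ++j)
        {
            // Reference: subtract the offsets before multiplying.
            int32_t ref = 0;
            for(int k = 0; k < K; ++k)
                ref += (static_cast<int32_t>(A[i * K + k]) - a_offset) * (static_cast<int32_t>(B[k * N + j]) - b_offset);

            // Split form: raw uint8 GEMM plus the "offset contribution" terms.
            int32_t raw = 0;
            for(int k = 0; k < K; ++k)
                raw += static_cast<int32_t>(A[i * K + k]) * static_cast<int32_t>(B[k * N + j]);
            const int32_t fused = raw - b_offset * sum_row_a[i] - a_offset * sum_col_b[j] + K * a_offset * b_offset;

            assert(ref == fused);
        }
    }
    return 0;
}
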
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index b010ca0..3c7411e 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,10 +59,8 @@
 {
     NEScheduler::get().schedule(&_border_handler, Window::DimZ);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
     NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index 8a85bba..0dbcb12 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,11 +74,6 @@
 
     if(num_levels > 1)
     {
-        _horizontal_border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
-        _vertical_border_handler   = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
-        _horizontal_reduction      = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
-        _vertical_reduction        = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
-
         // Apply half scale to the X dimension of the tensor shape
         TensorShape tensor_shape = pyramid->info()->tensor_shape();
         tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
@@ -86,19 +81,33 @@
         PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::S16);
         _tmp.init(pyramid_info);
 
+        _horizontal_reduction.reserve(num_levels);
+        _vertical_reduction.reserve(num_levels);
+        _horizontal_border_handler.reserve(num_levels);
+        _vertical_border_handler.reserve(num_levels);
+
         for(unsigned int i = 0; i < num_levels - 1; ++i)
         {
             /* Configure horizontal kernel */
-            _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
+            auto horizontal_kernel = support::cpp14::make_unique<NEGaussianPyramidHorKernel>();
+            horizontal_kernel->configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
 
             /* Configure vertical kernel */
-            _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
+            auto vertical_kernel = support::cpp14::make_unique<NEGaussianPyramidVertKernel>();
+            vertical_kernel->configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
 
             /* Configure border */
-            _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+            auto horizontal_border_kernel = support::cpp14::make_unique<NEFillBorderKernel>();
+            horizontal_border_kernel->configure(_pyramid->get_pyramid_level(i), horizontal_kernel->border_size(), border_mode, PixelValue(constant_border_value));
 
             /* Configure border */
-            _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
+            auto vertical_border_kernel = support::cpp14::make_unique<NEFillBorderKernel>();
+            vertical_border_kernel->configure(_tmp.get_pyramid_level(i), vertical_kernel->border_size(), border_mode, PixelValue(pixel_value_u16));
+
+            _vertical_border_handler.emplace_back(std::move(vertical_border_kernel));
+            _horizontal_border_handler.emplace_back(std::move(horizontal_border_kernel));
+            _vertical_reduction.emplace_back(std::move(vertical_kernel));
+            _horizontal_reduction.emplace_back(std::move(horizontal_kernel));
         }
 
         _tmp.allocate();
@@ -117,10 +126,10 @@
 
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
-        NEScheduler::get().schedule(_horizontal_border_handler.get() + i, Window::DimZ);
-        NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
-        NEScheduler::get().schedule(_vertical_border_handler.get() + i, Window::DimZ);
-        NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
+        NEScheduler::get().schedule(_horizontal_border_handler[i].get(), Window::DimZ);
+        NEScheduler::get().schedule(_horizontal_reduction[i].get(), Window::DimY);
+        NEScheduler::get().schedule(_vertical_border_handler[i].get(), Window::DimZ);
+        NEScheduler::get().schedule(_vertical_reduction[i].get(), Window::DimY);
     }
 }
 
@@ -147,19 +156,20 @@
 
     if(num_levels > 1)
     {
-        _gaus5x5       = arm_compute::support::cpp14::make_unique<NEGaussian5x5[]>(num_levels - 1);
-        _scale_nearest = arm_compute::support::cpp14::make_unique<NEScale[]>(num_levels - 1);
-
         PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
         _tmp.init(pyramid_info);
 
         for(unsigned int i = 0; i < num_levels - 1; ++i)
         {
             /* Configure gaussian 5x5 */
-            _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
+            auto gaus5x5_kernel = support::cpp14::make_unique<NEGaussian5x5>();
+            gaus5x5_kernel->configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
+            _gaus5x5.emplace_back(std::move(gaus5x5_kernel));
 
             /* Configure scale */
-            _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED);
+            auto scale_kernel = support::cpp14::make_unique<NEScale>();
+            scale_kernel->configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED);
+            _scale_nearest.emplace_back(std::move(scale_kernel));
         }
 
         _tmp.allocate();
@@ -178,7 +188,7 @@
 
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
-        _gaus5x5[i].run();
-        _scale_nearest[i].run();
+        _gaus5x5[i].get()->run();
+        _scale_nearest[i].get()->run();
     }
 }
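
The pyramid rework above replaces make_unique<Kernel[]>(num_levels - 1) arrays with vectors of unique_ptr built via reserve() and emplace_back(), then driven through the stored pointers. A minimal sketch of that ownership pattern, with a hypothetical DummyKernel standing in for the pyramid kernels:

#include <memory>
#include <vector>

// Hypothetical kernel type standing in for NEGaussianPyramidHorKernel and friends.
struct DummyKernel
{
    void configure(int level) { _level = level; }
    void run() const { /* process pyramid level _level */ }
    int  _level{ 0 };
};

int main()
{
    const unsigned int num_levels = 4;

    // Owning container replacing make_unique<DummyKernel[]>(num_levels - 1):
    // each kernel is configured before being moved into the vector.
    std::vector<std::unique_ptr<DummyKernel>> kernels;
    kernels.reserve(num_levels - 1);

    for(unsigned int i = 0; i < num_levels - 1; ++i)
    {
        auto k = std::make_unique<DummyKernel>();
        k->configure(static_cast<int>(i));
        kernels.emplace_back(std::move(k));
    }

    // Later, run/schedule through the stored pointers, as the run() loops do.
    for(const auto &k : kernels)
    {
        k->run();
    }
    return 0;
}
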
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index 5e98269..8efc091 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -95,7 +95,7 @@
 
 void NEHOGDescriptor::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run gradient
     _gradient.run();
@@ -105,6 +105,4 @@
 
     // Run block normalization kernel
     NEScheduler::get().schedule(&_block_norm, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index efc8690..90785fe 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -80,13 +80,11 @@
 
 void NEHOGGradient::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run derivative
     _derivative.run();
 
     // Run magnitude/phase kernel
     NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index 8c834e2..26abc9d 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -126,12 +126,12 @@
     _num_block_norm_kernel  = input_block_norm.size(); // Number of NEHOGBlockNormalizationKernel kernels to compute
     _num_hog_detect_kernel  = input_hog_detect.size(); // Number of NEHOGDetector functions to compute
 
-    _orient_bin_kernel = arm_compute::support::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
-    _block_norm_kernel = arm_compute::support::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
-    _hog_detect_kernel = arm_compute::support::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
+    _orient_bin_kernel.reserve(_num_orient_bin_kernel);
+    _block_norm_kernel.reserve(_num_block_norm_kernel);
+    _hog_detect_kernel.reserve(_num_hog_detect_kernel);
+    _hog_space.reserve(_num_orient_bin_kernel);
+    _hog_norm_space.reserve(_num_block_norm_kernel);
     _non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
-    _hog_space         = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
-    _hog_norm_space    = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
 
     // Allocate tensors for magnitude and phase
     TensorInfo info_mag(shape_img, Format::S16);
@@ -167,13 +167,17 @@
 
         // Allocate HOG space
         TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
-        _hog_space[i].allocator()->init(info_space);
+        auto       hog_space_tensor = support::cpp14::make_unique<Tensor>();
+        hog_space_tensor->allocator()->init(info_space);
 
         // Manage intermediate buffers
-        _memory_group.manage(_hog_space.get() + i);
+        _memory_group.manage(hog_space_tensor.get());
 
         // Initialise orientation binning kernel
-        _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+        auto orient_bin_kernel = support::cpp14::make_unique<NEHOGOrientationBinningKernel>();
+        orient_bin_kernel->configure(&_mag, &_phase, hog_space_tensor.get(), multi_hog->model(idx_multi_hog)->info());
+        _orient_bin_kernel.emplace_back(std::move(orient_bin_kernel));
+        _hog_space.emplace_back(std::move(hog_space_tensor));
     }
 
     // Allocate intermediate tensors
@@ -188,19 +192,23 @@
 
         // Allocate normalized HOG space
         TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
-        _hog_norm_space[i].allocator()->init(tensor_info);
+        auto       hog_norm_space_tensor = support::cpp14::make_unique<Tensor>();
+        hog_norm_space_tensor->allocator()->init(tensor_info);
 
         // Manage intermediate buffers
-        _memory_group.manage(_hog_norm_space.get() + i);
+        _memory_group.manage(hog_norm_space_tensor.get());
 
         // Initialize block normalization kernel
-        _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+        auto block_norm_kernel = support::cpp14::make_unique<NEHOGBlockNormalizationKernel>();
+        block_norm_kernel->configure(_hog_space[idx_orient_bin].get(), hog_norm_space_tensor.get(), multi_hog->model(idx_multi_hog)->info());
+        _block_norm_kernel.emplace_back(std::move(block_norm_kernel));
+        _hog_norm_space.emplace_back(std::move(hog_norm_space_tensor));
     }
 
     // Allocate intermediate tensors
     for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
     {
-        _hog_space[i].allocator()->allocate();
+        _hog_space[i].get()->allocator()->allocate();
     }
 
     // Configure HOG detector kernel
@@ -208,7 +216,9 @@
     {
         const size_t idx_block_norm = input_hog_detect[i];
 
-        _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+        auto hog_detect_kernel = support::cpp14::make_unique<NEHOGDetector>();
+        hog_detect_kernel->configure(_hog_norm_space[idx_block_norm].get(), multi_hog->model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+        _hog_detect_kernel.emplace_back(std::move(hog_detect_kernel));
     }
 
     // Configure non maxima suppression kernel
@@ -217,7 +227,7 @@
     // Allocate intermediate tensors
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
-        _hog_norm_space[i].allocator()->allocate();
+        _hog_norm_space[i]->allocator()->allocate();
     }
 }
 
@@ -225,7 +235,7 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Reset detection window
     _detection_windows->clear();
@@ -234,21 +244,21 @@
     _gradient_kernel.run();
 
     // Run orientation binning kernel
-    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+    for(auto &kernel : _orient_bin_kernel)
     {
-        NEScheduler::get().schedule(_orient_bin_kernel.get() + i, Window::DimY);
+        NEScheduler::get().schedule(kernel.get(), Window::DimY);
     }
 
     // Run block normalization kernel
-    for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+    for(auto &kernel : _block_norm_kernel)
     {
-        NEScheduler::get().schedule(_block_norm_kernel.get() + i, Window::DimY);
+        NEScheduler::get().schedule(kernel.get(), Window::DimY);
     }
 
     // Run HOG detector kernel
-    for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+    for(auto &kernel : _hog_detect_kernel)
     {
-        _hog_detect_kernel[i].run();
+        kernel->run();
     }
 
     // Run non-maxima suppression kernel if enabled
@@ -256,6 +266,4 @@
     {
         NEScheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
     }
-
-    _memory_group.release();
 }
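
The HOG rework above keeps a fixed ordering for intermediate tensors: hand each tensor to the memory group with manage(), configure the kernels that consume it, and only allocate once every configure() call has run. A small sketch of that ordering with hypothetical FakeTensor/FakeMemoryGroup stand-ins (the real manage()/allocate() semantics live in the runtime):

#include <iostream>
#include <memory>
#include <vector>

struct FakeTensor
{
    void allocate() { allocated = true; }
    bool allocated{ false };
};

struct FakeMemoryGroup
{
    void manage(FakeTensor *t) { managed.push_back(t); } // register before allocation
    std::vector<FakeTensor *> managed;
};

int main()
{
    FakeMemoryGroup                          group;
    std::vector<std::unique_ptr<FakeTensor>> hog_space;

    for(int i = 0; i < 3; ++i)
    {
        auto t = std::make_unique<FakeTensor>();
        group.manage(t.get());   // 1) let the group track the intermediate buffer
        /* configure the kernel writing into t.get() here */ // 2) configure consumers
        hog_space.emplace_back(std::move(t));
    }

    for(auto &t : hog_space)     // 3) allocate only after all configure() calls
    {
        t->allocate();
    }

    std::cout << "managed " << group.managed.size() << " intermediate tensors\n";
    return 0;
}
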
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index db5e926..3eadbee 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -90,7 +90,7 @@
     _score.allocator()->init(tensor_info_score);
     _nonmax.allocator()->init(tensor_info_score);
 
-    _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+    _corners_list.resize(shape.x() * shape.y());
 
     // Set/init Sobel kernel accordingly with gradient_size
     switch(gradient_size)
@@ -171,20 +171,20 @@
     _score.allocator()->allocate();
 
     // Init corner candidates kernel
-    _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+    _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates);
 
     // Allocate once all the configure methods have been called
     _nonmax.allocator()->allocate();
 
     // Init euclidean distance
-    _sort_euclidean.configure(_corners_list.get(), corners, &_num_corner_candidates, min_dist);
+    _sort_euclidean.configure(_corners_list.data(), corners, &_num_corner_candidates, min_dist);
 }
 
 void NEHarrisCorners::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Init to 0 number of corner candidates
     _num_corner_candidates = 0;
@@ -207,6 +207,4 @@
 
     // Run sort & euclidean distance
     NEScheduler::get().schedule(&_sort_euclidean, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
index f333ecb..d56bd7c 100644
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,7 +34,7 @@
 using namespace arm_compute;
 
 NEHistogram::NEHistogram()
-    : _histogram_kernel(), _local_hist(), _window_lut(arm_compute::support::cpp14::make_unique<uint32_t[]>(window_lut_default_size)), _local_hist_size(0)
+    : _histogram_kernel(), _local_hist(), _window_lut(window_lut_default_size), _local_hist_size(0)
 {
 }
 
@@ -45,10 +45,10 @@
 
     // Allocate space for threads local histograms
     _local_hist_size = output->num_bins() * NEScheduler::get().num_threads();
-    _local_hist      = arm_compute::support::cpp14::make_unique<uint32_t[]>(_local_hist_size);
+    _local_hist.resize(_local_hist_size);
 
     // Configure kernel
-    _histogram_kernel.configure(input, output, _local_hist.get(), _window_lut.get());
+    _histogram_kernel.configure(input, output, _local_hist.data(), _window_lut.data());
 }
 
 void NEHistogram::run()
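
_local_hist above is sized num_bins * NEScheduler::get().num_threads(), which points at a per-thread accumulation scheme: each worker fills its own slice of the buffer and the slices are merged at the end. A hedged sketch of that general idea only; histogram() below is a hypothetical helper, not how NEHistogramKernel is actually implemented:

#include <cstdint>
#include <vector>

std::vector<uint32_t> histogram(const std::vector<uint8_t> &pixels, size_t num_bins, size_t num_threads)
{
    std::vector<uint32_t> local_hist(num_bins * num_threads, 0);

    // Each "thread" t fills its own slice [t * num_bins, (t + 1) * num_bins).
    for(size_t t = 0; t < num_threads; ++t)
    {
        const size_t begin = t * pixels.size() / num_threads;
        const size_t end   = (t + 1) * pixels.size() / num_threads;
        for(size_t i = begin; i < end; ++i)
        {
            const size_t bin = pixels[i] * num_bins / 256; // map 8-bit value to a bin
            ++local_hist[t * num_bins + bin];
        }
    }

    // Merge the per-thread slices into the final histogram.
    std::vector<uint32_t> hist(num_bins, 0);
    for(size_t t = 0; t < num_threads; ++t)
    {
        for(size_t b = 0; b < num_bins; ++b)
        {
            hist[b] += local_hist[t * num_bins + b];
        }
    }
    return hist;
}

int main()
{
    const std::vector<uint8_t> pixels = { 0, 10, 10, 200, 255, 255 };
    const auto                 hist   = histogram(pixels, 4, 2);
    return hist[0] == 3 ? 0 : 1; // bins of width 64: {0, 10, 10} land in bin 0
}
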
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index 56da966..c9ab5c9 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,11 +68,9 @@
 
 void NEL2NormalizeLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     _reduce_func.run();
     NEScheduler::get().schedule(&_normalize_kernel, Window::DimY);
-
-    _memory_group.release();
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index 9e7a713..3d3c6a1 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -43,10 +43,10 @@
       _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state(), _accum_cell_state1(), _accum_cell_state2(),
       _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(),
       _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(),
-      _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _input_gate_out5(),
-      _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(),
-      _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false),
-      _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false)
+      _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
+      _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(),
+      _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(),
+      _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false)
 {
 }
 
@@ -96,22 +96,32 @@
 
     // Configure block that calculates the forget gate
     // forget_gate = Activation(input * input_to_forget_weights + output_state_in * recurrent_to_forget_weights + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
-    TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    // We optimize this as follows:
+    // forget_gate = Activation( (input,output_state_in) * (input_to_forget_weights,recurrent_to_forget_weights) + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
     _forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
     _forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     _forget_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
-    _memory_group.manage(&_forget_gate_out1);
-    _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1);
+    std::vector<const ITensor *> inputs_vector;
+    inputs_vector.emplace_back(input);
+    inputs_vector.emplace_back(output_state_in);
+
     _memory_group.manage(&_forget_gate_out2);
-    _transpose_forget_gate.configure(recurrent_to_forget_weights, &_forget_gate_out2);
-    _memory_group.manage(&_forget_gate_out3);
-    _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
-    _forget_gate_out2.allocator()->allocate();
+    _concat_inputs_forget_gate.configure(inputs_vector, &_forget_gate_out2);
+
+    std::vector<const ITensor *> weights_vector;
+
+    weights_vector.emplace_back(input_to_forget_weights);
+    weights_vector.emplace_back(recurrent_to_forget_weights);
+
+    _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6);
+
     _memory_group.manage(&_forget_gate_out5);
-    _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
-    _forget_gate_out1.allocator()->allocate();
+    _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, forget_gate_bias, &_forget_gate_out5);
+    _memory_group.manage(&_forget_gate_out1);
+    _memory_group.manage(&_forget_gate_out3);
+    _forget_gate_out6.allocator()->allocate();
+
     Tensor *forget_gate_out = &_forget_gate_out5;
     if(lstm_params.has_peephole_opt())
     {
@@ -134,6 +144,8 @@
     // Configure block that calculates the input gate
     // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     // input_gate = 1 - forget_gate, with CIFG
+    // We optimize this as follows:
+    // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     Tensor *input_gate_out = &_input_gate_out1;
     if(lstm_params.has_cifg_opt())
@@ -146,31 +158,29 @@
     }
     else
     {
-        TensorShape input_gate_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
-
-        _input_gate_out2.allocator()->init(TensorInfo(input_gate_shape, 1, input->info()->data_type()));
         _input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _input_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-        _input_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+        std::vector<const ITensor *> lstm_weights;
+        lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+        lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+
+        _concat_weights_input_gate.configure(lstm_weights, &_input_gate_out2);
 
         _memory_group.manage(&_input_gate_out1);
-        _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1);
-        _memory_group.manage(&_input_gate_out2);
-        _transpose_input_gate.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
-        _memory_group.manage(&_input_gate_out3);
-        _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
-        _input_gate_out2.allocator()->allocate();
         _memory_group.manage(&_input_gate_out4);
-        _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
-        _input_gate_out3.allocator()->allocate();
-        input_gate_out = &_input_gate_out4;
+
+        _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, lstm_params.input_gate_bias(), &_input_gate_out3);
+        _input_gate_out2.allocator()->allocate();
+        input_gate_out = &_input_gate_out3;
+
         if(_run_peephole_opt)
         {
-            _memory_group.manage(&_input_gate_out5);
-            _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-            _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _memory_group.manage(&_input_gate_out4);
+            _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+            _accum_input_gate2.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _input_gate_out3.allocator()->allocate();
             _input_gate_out4.allocator()->allocate();
-            _input_gate_out5.allocator()->allocate();
             input_gate_out = &_input_gate_out1;
         }
         else
@@ -215,35 +225,37 @@
 
     // Configure block that calculates the output
     // output_state_out = Activation(input * input_to_output_weights + output_state_in * recurrent_to_output_weights + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
-    TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    // We optimize this as follows:
+    // output_state_out = Activation( (input,output_state_in) * (input_to_output_weights, recurrent_to_output_weights) + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
     _output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
-    _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-    _output5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _output4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
+    std::vector<const ITensor *> in_out_weights;
+    in_out_weights.emplace_back(input_to_output_weights);
+    in_out_weights.emplace_back(recurrent_to_output_weights);
+
+    _concat_weights_output.configure(in_out_weights, &_output2);
     _memory_group.manage(&_output1);
-    _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1);
-    _memory_group.manage(&_output2);
-    _transpose_output.configure(recurrent_to_output_weights, &_output2);
-    _memory_group.manage(&_output3);
-    _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
+    _memory_group.manage(&_output4);
+
+    _fully_connected_output.configure(&_forget_gate_out2, &_output2, output_gate_bias, &_output4);
+
     _output2.allocator()->allocate();
-    _memory_group.manage(&_output5);
-    _accum_output1.configure(&_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
-    _output3.allocator()->allocate();
-    Tensor *output_gate_out = &_output5;
+    _forget_gate_out2.allocator()->allocate();
+
+    Tensor *output_gate_out = &_output4;
     if(lstm_params.has_peephole_opt())
     {
-        _output4.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
+        _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
 
-        _memory_group.manage(&_output4);
-        _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-        _accum_output2.configure(&_output5, &_output4, &_output1, ConvertPolicy::SATURATE);
-        _output5.allocator()->allocate();
+        _memory_group.manage(&_output3);
+        _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+        _accum_output2.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
+        _output4.allocator()->allocate();
         output_gate_out = &_output1;
 
         // Allocate intermediate buffers
-        _output4.allocator()->allocate();
+        _output3.allocator()->allocate();
     }
     else
     {
@@ -368,10 +380,15 @@
     TensorInfo output_gate_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
     TensorInfo cell_state_tmp  = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
 
+    std::vector<const ITensorInfo *> inputs_vector;
+    inputs_vector.emplace_back(input);
+    inputs_vector.emplace_back(output_state_in);
+    TensorInfo forget_gate_concat;
+    ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector, &forget_gate_concat));
+
     // Validate forget gate
     ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
@@ -389,9 +406,13 @@
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
 
+        std::vector<const ITensorInfo *> lstm_weights;
+        lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+        lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+        TensorInfo lstm_gate_concat;
+        ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(lstm_weights, &lstm_gate_concat));
         ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &input_gate, 1.f, 0.f, GEMMInfo()));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+
         if(lstm_params.has_peephole_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
@@ -421,9 +442,14 @@
     }
 
     // Validate output gate tmp
+    std::vector<const ITensorInfo *> in_out_weights;
+    in_out_weights.emplace_back(input_to_output_weights);
+    in_out_weights.emplace_back(recurrent_to_output_weights);
+    TensorInfo in_out_gate_concat;
+    ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(in_out_weights, &in_out_gate_concat));
+
     ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &output_gate_tmp, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
@@ -465,12 +491,12 @@
 
 void NELSTMLayer::run()
 {
-    _memory_group.acquire();
+    prepare();
 
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    _concat_inputs_forget_gate.run();
     _fully_connected_forget_gate.run();
-    NEScheduler::get().schedule(&_transpose_forget_gate, Window::DimY);
-    _gemm_forget_gate.run();
-    NEScheduler::get().schedule(&_accum_forget_gate1, Window::DimY);
 
     if(_run_peephole_opt)
     {
@@ -494,9 +520,7 @@
     else
     {
         _fully_connected_input_gate.run();
-        NEScheduler::get().schedule(&_transpose_input_gate, Window::DimY);
-        _gemm_input_gate.run();
-        NEScheduler::get().schedule(&_accum_input_gate1, Window::DimY);
+
         if(_run_peephole_opt)
         {
             NEScheduler::get().schedule(&_pixelwise_mul_input_gate, Window::DimY);
@@ -520,10 +544,6 @@
     }
 
     _fully_connected_output.run();
-    NEScheduler::get().schedule(&_transpose_output, Window::DimY);
-    _gemm_output.run();
-    NEScheduler::get().schedule(&_accum_output1, Window::DimY);
-
     if(_run_peephole_opt)
     {
         NEScheduler::get().schedule(&_pixelwise_mul_output_state1, Window::DimY);
@@ -547,6 +567,18 @@
     NEScheduler::get().schedule(&_copy_output, Window::DimY);
 
     _concat_scratch_buffer.run();
+}
 
-    _memory_group.release();
-}
\ No newline at end of file
+void NELSTMLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        _concat_weights_forget_gate.run();
+        if(!_run_cifg_opt)
+        {
+            _concat_weights_input_gate.run();
+        }
+        _concat_weights_output.run();
+        _is_prepared = true;
+    }
+}
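
The gate rework above leans on the block-matrix identity spelled out in the new comments: multiplying the concatenation of input and output_state by the correspondingly concatenated weights equals the sum of the two separate products, so one fully connected layer replaces the old FC + transpose + GEMM + accumulate chain per gate. A toy check of that algebra with plain loops; the sizes and row-major layout are illustrative only, not the library's weight layout:

#include <cassert>
#include <vector>

int main()
{
    const int dx = 2, dh = 3, n = 4; // input width, state width, number of cells

    std::vector<float> x = { 1.f, 2.f };
    std::vector<float> h = { 3.f, 4.f, 5.f };
    std::vector<float> wx(dx * n), wh(dh * n);
    for(int i = 0; i < dx * n; ++i) wx[i] = 0.1f * static_cast<float>(i + 1);
    for(int i = 0; i < dh * n; ++i) wh[i] = 0.2f * static_cast<float>(i + 1);

    // Separate products, then add (the old FC + GEMM + accumulate path).
    std::vector<float> separate(n, 0.f);
    for(int j = 0; j < n; ++j)
    {
        for(int i = 0; i < dx; ++i) separate[j] += x[i] * wx[i * n + j];
        for(int i = 0; i < dh; ++i) separate[j] += h[i] * wh[i * n + j];
    }

    // Concatenated input [x | h] against stacked weights [wx ; wh], one product
    // (the new single fully connected path).
    std::vector<float> xh(x);
    xh.insert(xh.end(), h.begin(), h.end());
    std::vector<float> w(wx);
    w.insert(w.end(), wh.begin(), wh.end());

    std::vector<float> fused(n, 0.f);
    for(int j = 0; j < n; ++j)
        for(int i = 0; i < dx + dh; ++i) fused[j] += xh[i] * w[i * n + j];

    // The accumulation order is identical in both paths, so exact comparison is valid here.
    for(int j = 0; j < n; ++j)
        assert(fused[j] == separate[j]);

    return 0;
}
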
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index 0e149d4..5174a13 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -92,8 +92,8 @@
     // Create Gaussian Pyramid function
     _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
 
-    _convf = arm_compute::support::cpp14::make_unique<NEGaussian5x5[]>(_num_levels);
-    _subf  = arm_compute::support::cpp14::make_unique<NEArithmeticSubtraction[]>(_num_levels);
+    _convf.resize(_num_levels);
+    _subf.resize(_num_levels);
 
     for(unsigned int i = 0; i < _num_levels; ++i)
     {
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
index 9ad9689..b2d889b 100644
--- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -64,8 +64,8 @@
     _tmp_pyr.init(pyramid_info);
 
     // Allocate add and scale functions. Level 0 does not need to be scaled.
-    _addf   = arm_compute::support::cpp14::make_unique<NEArithmeticAddition[]>(num_levels);
-    _scalef = arm_compute::support::cpp14::make_unique<NEScale[]>(num_levels - 1);
+    _addf.resize(num_levels);
+    _scalef.resize(num_levels - 1);
 
     const size_t last_level = num_levels - 1;
 
@@ -86,7 +86,7 @@
 
 void NELaplacianReconstruct::run()
 {
-    ARM_COMPUTE_ERROR_ON_MSG(_addf == nullptr, "Unconfigured function");
+    ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function");
 
     const size_t last_level = _tmp_pyr.info()->num_levels() - 1;
 
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 80a2541..d08202d 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -168,7 +168,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run input reshaping
     NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
@@ -178,8 +178,6 @@
 
     // Reshape output matrix
     NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
-
-    _memory_group.release();
 }
 
 void NELocallyConnectedLayer::prepare()
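
The acquire()/release() pairs removed in this and the following run() methods are replaced by MemoryGroupResourceScope, a scope object that acquires the group on construction and releases it on destruction, so every exit path (including early returns and exceptions) releases the memory. A simplified sketch of the idiom, not the actual arm_compute definitions:

class MemoryGroup
{
public:
    void acquire() { /* map backing memory */ }
    void release() { /* unmap backing memory */ }
};

class MemoryGroupResourceScope
{
public:
    explicit MemoryGroupResourceScope(MemoryGroup &group) : _group(group) { _group.acquire(); }
    ~MemoryGroupResourceScope() { _group.release(); }

private:
    MemoryGroup &_group;
};

void run_example(MemoryGroup &mg)
{
    MemoryGroupResourceScope scope_mg(mg); // replaces the manual acquire()/release() pair
    // ... schedule kernels; any early return still releases the group ...
}                                          // release() runs here on every exit path
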
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index f00114f..d52e928 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -69,11 +69,9 @@
 
 void NENormalizationLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
     NEScheduler::get().schedule(&_border_handler, Window::DimY);
     NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index e90d8f6..0df01c6 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,10 +74,10 @@
 
     const float pyr_scale = old_pyramid->info()->scale();
 
-    _func_scharr    = arm_compute::support::cpp14::make_unique<NEScharr3x3[]>(_num_levels);
-    _kernel_tracker = arm_compute::support::cpp14::make_unique<NELKTrackerKernel[]>(_num_levels);
-    _scharr_gx      = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_levels);
-    _scharr_gy      = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_levels);
+    _func_scharr.reserve(_num_levels);
+    _kernel_tracker.reserve(_num_levels);
+    _scharr_gx.reserve(_num_levels);
+    _scharr_gy.reserve(_num_levels);
 
     _old_points_internal = LKInternalKeypointArray(old_points->num_values());
     _new_points_internal = LKInternalKeypointArray(old_points->num_values());
@@ -95,25 +95,34 @@
 
         TensorInfo tensor_info(TensorShape(width_ith, height_ith), Format::S16);
 
-        _scharr_gx[i].allocator()->init(tensor_info);
-        _scharr_gy[i].allocator()->init(tensor_info);
+        auto scharr_gx = support::cpp14::make_unique<Tensor>();
+        auto scharr_gy = support::cpp14::make_unique<Tensor>();
+        scharr_gx->allocator()->init(tensor_info);
+        scharr_gy->allocator()->init(tensor_info);
 
         // Manage intermediate buffers
-        _memory_group.manage(_scharr_gx.get() + i);
-        _memory_group.manage(_scharr_gy.get() + i);
+        _memory_group.manage(scharr_gx.get());
+        _memory_group.manage(scharr_gy.get());
 
         // Init Scharr kernel
-        _func_scharr[i].configure(old_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, border_mode, constant_border_value);
+        auto func_scharr = support::cpp14::make_unique<NEScharr3x3>();
+        func_scharr->configure(old_ith_input, scharr_gx.get(), scharr_gy.get(), border_mode, constant_border_value);
 
         // Init Lucas-Kanade kernel
-        _kernel_tracker[i].configure(old_ith_input, new_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i,
-                                     old_points, new_points_estimates, new_points,
-                                     &_old_points_internal, &_new_points_internal,
-                                     termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
-                                     i, _num_levels, pyr_scale);
+        auto kernel_tracker = support::cpp14::make_unique<NELKTrackerKernel>();
+        kernel_tracker->configure(old_ith_input, new_ith_input, scharr_gx.get(), scharr_gy.get(),
+                                  old_points, new_points_estimates, new_points,
+                                  &_old_points_internal, &_new_points_internal,
+                                  termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
+                                  i, _num_levels, pyr_scale);
 
-        _scharr_gx[i].allocator()->allocate();
-        _scharr_gy[i].allocator()->allocate();
+        scharr_gx->allocator()->allocate();
+        scharr_gy->allocator()->allocate();
+
+        _func_scharr.emplace_back(std::move(func_scharr));
+        _kernel_tracker.emplace_back(std::move(kernel_tracker));
+        _scharr_gx.emplace_back(std::move(scharr_gx));
+        _scharr_gy.emplace_back(std::move(scharr_gy));
     }
 }
 
@@ -121,16 +130,14 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     for(unsigned int level = _num_levels; level > 0; --level)
     {
         // Run Scharr kernel
-        _func_scharr[level - 1].run();
+        _func_scharr[level - 1]->run();
 
         // Run Lucas-Kanade kernel
-        NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX);
+        NEScheduler::get().schedule(_kernel_tracker[level - 1].get(), Window::DimX);
     }
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index f5c2718..c608edf 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -25,7 +25,6 @@
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
-#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
@@ -61,18 +60,28 @@
 
     return coords;
 }
+
+uint32_t last_padding_dimension(const PaddingList &padding)
+{
+    int last_padding_dim = padding.size() - 1;
+    for(; last_padding_dim >= 0; --last_padding_dim)
+    {
+        if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0)
+        {
+            break;
+        }
+    }
+    return static_cast<uint32_t>(last_padding_dim);
+}
 } // namespace
 
 NEPadLayer::NEPadLayer()
-    : _memset_kernel(), _copy_kernel(), _output_subtensor()
+    : _copy_kernel(), _mode(), _padding(), _memset_kernel(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results(), _output_subtensor()
 {
 }
 
-void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, PixelValue constant_value)
+void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_THROW_ON_ERROR(NEPadLayer::validate(input->info(), output->info(), padding, constant_value));
-
     // Auto-init
     auto_init_if_empty(*output->info(), get_expected_output_tensorinfo(*input->info(), padding));
 
@@ -86,23 +95,235 @@
     _copy_kernel.configure(input, &_output_subtensor);
 }
 
-Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
+void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *output)
+{
+    // Reflecting can be performed by effectively unfolding the input as follows:
+    // For each dimension starting at DimX:
+    //      For before and after:
+    //          Use strided slice to extract and reverse the part of the
+    //          input / previously produced tensor required for the padding.
+    //      Concatenate the before and after padding with the input / previously
+    //      produced tensor along the current dimension.
+
+    // Two strided slice functions are required for each padded dimension, as well as a
+    // concatenate function and tensors to hold the temporary results.
+    _slice_functions.resize(2 * _num_dimensions);
+    _slice_results.resize(2 * _num_dimensions);
+    _concat_functions.resize(_num_dimensions);
+    _concat_results.resize(_num_dimensions - 1);
+
+    Coordinates starts_before{};
+    Coordinates ends_before{};
+    Coordinates starts_after{};
+    Coordinates ends_after{};
+    Coordinates strides{};
+    ITensor    *prev = input;
+    for(uint32_t i = 0; i < _num_dimensions; ++i)
+    {
+        // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again.
+        if(i > 0)
+        {
+            strides.set(i - 1, 1);
+        }
+
+        if(_padding[i].first > 0 || _padding[i].second > 0)
+        {
+            // Set the starts, ends, and strides values for the current dimension.
+            // Due to the bit masks passed to strided slice, the values below the current dimension in
+            // starts and ends will be ignored so do not need to be modified.
+            if(_mode == PaddingMode::REFLECT)
+            {
+                starts_before.set(i, _padding[i].first);
+                ends_before.set(i, 0);
+                starts_after.set(i, input->info()->dimension(i) - 2);
+                ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 2);
+                strides.set(i, -1);
+            }
+            else
+            {
+                starts_before.set(i, _padding[i].first - 1);
+                ends_before.set(i, -1);
+                starts_after.set(i, input->info()->dimension(i) - 1);
+                ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 1);
+                strides.set(i, -1);
+            }
+
+            // Strided slice wraps negative indices around to the end of the range; here a negative
+            // value should instead indicate the full range, so the bit mask is modified accordingly.
+            const int32_t begin_mask_before = starts_before[i] < 0 ? ~0 : ~(1u << i);
+            const int32_t end_mask_before   = ends_before[i] < 0 ? ~0 : ~(1u << i);
+            const int32_t begin_mask_after  = starts_after[i] < 0 ? ~0 : ~(1u << i);
+            const int32_t end_mask_after    = ends_after[i] < 0 ? ~0 : ~(1u << i);
+
+            // Reflect the input values for the padding before and after the input.
+            std::vector<ITensor *> concat_vector;
+            if(_padding[i].first > 0)
+            {
+                if(i < prev->info()->num_dimensions())
+                {
+                    _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
+                    concat_vector.emplace_back(&_slice_results[2 * i]);
+                }
+                else
+                {
+                    // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+                    concat_vector.push_back(prev);
+                }
+            }
+            concat_vector.push_back(prev);
+            if(_padding[i].second > 0)
+            {
+                if(i < prev->info()->num_dimensions())
+                {
+                    _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
+                    concat_vector.emplace_back(&_slice_results[2 * i + 1]);
+                }
+                else
+                {
+                    // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+                    concat_vector.push_back(prev);
+                }
+            }
+            // Concatenate the padding before and after with the input.
+            ITensor *out = (i == _num_dimensions - 1) ? output : &_concat_results[i];
+            _concat_functions[i].configure(concat_vector, out, i);
+            if(i != _num_dimensions - 1)
+            {
+                _concat_results[i].allocator()->allocate();
+            }
+            prev = out;
+        }
+        _slice_results[2 * i].allocator()->allocate();
+        _slice_results[2 * i + 1].allocator()->allocate();
+    }
+}
+
+void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+{
+    ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
+
+    _padding = padding;
+    _mode    = mode;
+
+    const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
+
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
+
+    // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied.
+    _num_dimensions = last_padding_dimension(padding) + 1;
+    if(_num_dimensions > 0)
+    {
+        switch(_mode)
+        {
+            case PaddingMode::CONSTANT:
+            {
+                configure_constant_mode(input, output, padding, constant_value);
+                break;
+            }
+            case PaddingMode::REFLECT:
+            case PaddingMode::SYMMETRIC:
+            {
+                configure_reflect_symmetric_mode(input, output);
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Padding mode not supported.");
+        }
+    }
+    else
+    {
+        // Copy the input to the whole output if no padding is applied
+        _copy_kernel.configure(input, output);
+    }
+}
+
+Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
 {
     ARM_COMPUTE_UNUSED(constant_value);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
 
-    auto output_clone = output->clone();
+    const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
 
-    SubTensorInfo output_subtensor_info(output_clone.get(), input->tensor_shape(), get_subtensor_coords(padding), true);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output_clone, padding));
-    ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(input, &output_subtensor_info));
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
 
+    switch(mode)
+    {
+        case PaddingMode::CONSTANT:
+        {
+            auto          output_clone = output->clone();
+            SubTensorInfo output_subtensor_info(output_clone.get(), input->tensor_shape(), get_subtensor_coords(padding), true);
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output_clone, padding));
+            ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(input, &output_subtensor_info));
+            break;
+        }
+        case PaddingMode::REFLECT:
+        case PaddingMode::SYMMETRIC:
+        {
+            for(uint32_t i = 0; i < padding.size(); ++i)
+            {
+                if(mode == PaddingMode::REFLECT)
+                {
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
+                }
+                else
+                {
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first > input->dimension(i));
+                    ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second > input->dimension(i));
+                }
+            }
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Invalid mode");
+        }
+    }
     return Status{};
 }
 
 void NEPadLayer::run()
 {
-    NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+    if(_num_dimensions > 0)
+    {
+        switch(_mode)
+        {
+            case PaddingMode::CONSTANT:
+            {
+                NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+                NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+                break;
+            }
+            case PaddingMode::REFLECT:
+            case PaddingMode::SYMMETRIC:
+            {
+                for(uint32_t i = 0; i < _num_dimensions; ++i)
+                {
+                    if(_padding[i].first > 0 || _padding[i].second > 0)
+                    {
+                        if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
+                        {
+                            _slice_functions[2 * i].run();
+                        }
+                        if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
+                        {
+                            _slice_functions[2 * i + 1].run();
+                        }
+                        _concat_functions[i].run();
+                    }
+                }
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Padding mode not supported.");
+        }
+    }
+    else
+    {
+        NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+    }
 }
 } // namespace arm_compute
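
The starts/ends/strides chosen in configure_reflect_symmetric_mode() follow the usual mirroring rules: REFLECT excludes the edge element while SYMMETRIC repeats it, and both read the source slice with a stride of -1. A 1-D sketch of the same index arithmetic, written as a hypothetical standalone helper rather than the library code:

#include <cassert>
#include <cstddef>
#include <vector>

enum class Mode { Reflect, Symmetric };

// Pads a 1-D sequence by mirroring around its edges.
std::vector<int> pad_1d(const std::vector<int> &in, std::size_t before, std::size_t after, Mode mode)
{
    const std::size_t n = in.size();
    assert(mode == Mode::Symmetric ? (before <= n && after <= n) : (before < n && after < n));

    std::vector<int> out;
    // "Before" block, taken in reverse order:
    //   REFLECT  : indices before, before - 1, ..., 1   (the edge element is not repeated)
    //   SYMMETRIC: indices before - 1, ..., 1, 0        (the edge element is repeated)
    for(std::size_t k = 0; k < before; ++k)
    {
        const std::size_t idx = (mode == Mode::Reflect) ? (before - k) : (before - 1 - k);
        out.push_back(in[idx]);
    }
    // Original data.
    out.insert(out.end(), in.begin(), in.end());
    // "After" block, mirrored around the other edge.
    for(std::size_t k = 0; k < after; ++k)
    {
        const std::size_t idx = (mode == Mode::Reflect) ? (n - 2 - k) : (n - 1 - k);
        out.push_back(in[idx]);
    }
    return out;
}

// pad_1d({1, 2, 3, 4}, 2, 2, Mode::Reflect)   -> {3, 2, 1, 2, 3, 4, 3, 2}
// pad_1d({1, 2, 3, 4}, 2, 2, Mode::Symmetric) -> {2, 1, 1, 2, 3, 4, 4, 3}
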
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index cf6b984..ef28fe9 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,8 +29,8 @@
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
@@ -51,3 +51,27 @@
 {
     return NEPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
 }
+
+void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEComplexPixelWiseMultiplicationKernel>();
+    k->configure(input1, input2, output);
+    _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
+}
+
+Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return NEComplexPixelWiseMultiplicationKernel::validate(input1, input2, output);
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index 8f7db96..65873b1 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,22 +26,13 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
 
 using namespace arm_compute;
 
-NEQuantizationLayer::NEQuantizationLayer()
-    : _quantize_kernel(), _min_max_kernel(), _min_max()
-{
-}
-
 Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
-    TensorInfo min_max{ input->num_channels(), input->data_type() };
-    ARM_COMPUTE_RETURN_ON_ERROR(NEMinMaxLayerKernel::validate(input, &min_max));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output, &min_max));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output));
 
     return Status{};
 }
@@ -50,24 +41,8 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel
-    _min_max_kernel.configure(input, &_min_max);
-
     // Configure quantize kernel
-    _quantize_kernel.configure(input, output, &_min_max);
-
-    // Allocate min_max tensor
-    _min_max.allocator()->allocate();
-}
-
-void NEQuantizationLayer::run()
-{
-    // Reset min and max
-    _min_max_kernel.reset();
-
-    // Run min and max kernel
-    NEScheduler::get().schedule(&_min_max_kernel, Window::DimY);
-
-    // Run quantize kernel
-    NEScheduler::get().schedule(&_quantize_kernel, Window::DimY);
+    auto k = arm_compute::support::cpp14::make_unique<NEQuantizationLayerKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
 }
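
NEQuantizationLayer now follows the single-kernel "simple function" pattern: configure() builds the kernel and hands it to a base class whose run() schedules it, so the bespoke run() and the min/max intermediate disappear. A simplified sketch of that pattern with stand-in types (std::make_unique in place of the support helper):

#include <memory>
#include <utility>

struct IKernel
{
    virtual ~IKernel() = default;
    virtual void run_on_window() = 0;
};

class ISimpleFunction
{
public:
    void run() { _kernel->run_on_window(); } // single scheduling point shared by all wrappers

protected:
    std::unique_ptr<IKernel> _kernel;
};

struct QuantizationKernel : IKernel
{
    void configure(/* const ITensor *input, ITensor *output */) {}
    void run_on_window() override { /* quantize the configured window */ }
};

class QuantizationLayer : public ISimpleFunction
{
public:
    void configure(/* const ITensor *input, ITensor *output */)
    {
        auto k = std::make_unique<QuantizationKernel>();
        k->configure();
        _kernel = std::move(k); // the base class run() will schedule it
    }
};
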
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index 995d5ee..9ca7ded 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -104,7 +104,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     _fully_connected_kernel.run();
 
@@ -115,8 +115,6 @@
 
     // copy hidden out to output
     NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
-
-    _memory_group.release();
 }
 
 void NERNNLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 014895f..0b145f0 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
 
+#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
@@ -37,6 +38,8 @@
 {
     ARM_COMPUTE_UNUSED(keep_dims);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
     TensorShape        out_shape     = input->tensor_shape();
@@ -78,10 +81,10 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
-    _reduction_ops     = reduction_axis.num_dimensions();
-    _reduction_kernels = arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops);
-    _reduced_outs      = arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
-    _keep_dims         = keep_dims;
+    _reduction_ops = reduction_axis.num_dimensions();
+    _reduction_kernels.resize(_reduction_ops);
+    _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+    _keep_dims = keep_dims;
 
     Coordinates        axis_local    = reduction_axis;
     const int          input_dims    = input->info()->num_dimensions();
@@ -96,9 +99,9 @@
     // Perform reduction for every axis
     for(unsigned int i = 0; i < _reduction_ops; ++i)
     {
-        TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+        TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
         out_shape.set(axis_local[i], 1);
-        auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+        auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
 
         if(i == _reduction_ops - 1 && keep_dims)
         {
@@ -107,8 +110,8 @@
         else
         {
             _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
-            _memory_group.manage(_reduced_outs.get() + i);
-            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
+            _memory_group.manage(&_reduced_outs[i]);
+            _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
         }
     }
 
@@ -131,13 +134,13 @@
             out_shape.remove_dimension(axis_local[i] - i);
         }
         auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
-        _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+        _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
     }
 }
 
 void NEReduceMean::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     for(unsigned int i = 0; i < _reduction_ops; ++i)
     {
@@ -148,5 +151,4 @@
     {
         _reshape.run();
     }
-    _memory_group.release();
 }
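
NEReduceMean reduces one axis at a time, chaining each intermediate tensor into the next reduction and reshaping at the end when keep_dims is false. A shape-only sketch of that progression, using a hypothetical helper:

#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<std::size_t> reduce_mean_shape(std::vector<std::size_t> shape,
                                           const std::vector<std::size_t> &axes,
                                           bool keep_dims)
{
    // One reduction step per axis: that dimension collapses to 1.
    for(std::size_t axis : axes)
    {
        shape[axis] = 1;
    }
    if(keep_dims)
    {
        return shape;
    }
    // Final reshape: drop the reduced dimensions.
    std::vector<std::size_t> squeezed;
    for(std::size_t i = 0; i < shape.size(); ++i)
    {
        if(std::find(axes.begin(), axes.end(), i) == axes.end())
        {
            squeezed.push_back(shape[i]);
        }
    }
    return squeezed;
}

// reduce_mean_shape({8, 8, 16, 2}, {0, 1}, true)  -> {1, 1, 16, 2}
// reduce_mean_shape({8, 8, 16, 2}, {0, 1}, false) -> {16, 2}
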
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 9f81a40..a0aed96 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -66,7 +66,8 @@
 
 void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op));
 
     // Configure reduction kernel
     _reduction_kernel.configure(input, output, axis, op);
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 483aa4c..425ee6c 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -97,14 +97,17 @@
       _dx(),
       _dy(),
       _scale_kernel(),
-      _border_handler()
+      _border_handler(),
+      _use_padding(true)
 {
 }
 
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), policy, border_mode, constant_border_value, sampling_policy));
+    ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), policy, border_mode, constant_border_value, sampling_policy, use_padding));
+
+    _use_padding = use_padding;
 
     // Get data layout and width/height indices
     const DataLayout data_layout = input->info()->data_layout();
@@ -134,7 +137,7 @@
             TensorInfo tensor_info_offsets(shape, Format::S32);
             _offsets.allocator()->init(tensor_info_offsets);
 
-            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_mode, sampling_policy);
+            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_mode, constant_border_value, sampling_policy, use_padding);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -152,7 +155,7 @@
             _dx.allocator()->init(tensor_info_dxdy);
             _dy.allocator()->init(tensor_info_dxdy);
 
-            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_mode, sampling_policy);
+            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_mode, constant_border_value, sampling_policy, use_padding);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -165,18 +168,20 @@
         }
         case InterpolationPolicy::AREA:
         {
-            _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_mode);
+            _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_mode, constant_border_value);
             break;
         }
         default:
             ARM_COMPUTE_ERROR("Unsupported interpolation mode");
     }
-
-    _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
+    if(use_padding)
+    {
+        _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
+    }
 }
 
 Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
-                         BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+                         BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
@@ -213,12 +218,15 @@
     }
 
     ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(),
-                                                        policy, border_mode, sampling_policy));
+                                                        policy, border_mode, constant_border_value, sampling_policy, use_padding));
     return Status{};
 }
 
 void NEScale::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    if(_use_padding)
+    {
+        NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    }
     NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
index d8f4eda..2ddfee5 100644
--- a/src/runtime/NEON/functions/NESobel5x5.cpp
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,10 +81,8 @@
 {
     NEScheduler::get().schedule(&_border_handler, Window::DimZ);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
     NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
index 5b6f60b..b47a37a 100644
--- a/src/runtime/NEON/functions/NESobel7x7.cpp
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,10 +81,8 @@
 {
     NEScheduler::get().schedule(&_border_handler, Window::DimZ);
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
     NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
-
-    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 36b7d47..79a9496 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -180,7 +180,7 @@
 
 void NESoftmaxLayer::run()
 {
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_needs_flattening)
     {
@@ -195,7 +195,5 @@
     {
         NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
     }
-
-    _memory_group.release();
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
new file mode 100644
index 0000000..46c28ad
--- /dev/null
+++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NESpaceToBatchLayer::NESpaceToBatchLayer()
+    : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
+{
+}
+
+void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
+
+    if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+    {
+        _has_padding = true;
+        _memset_kernel.configure(output, PixelValue());
+    }
+    _space_to_batch_kernel.configure(input, block_shape, paddings, output);
+}
+
+void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+    {
+        _has_padding = true;
+        _memset_kernel.configure(output, PixelValue());
+    }
+    _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+}
+
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
+
+    return Status{};
+}
+
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+                                     const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+
+    return Status{};
+}
+
+void NESpaceToBatchLayer::run()
+{
+    // Zero out the output only if there is padding
+    if(_has_padding)
+    {
+        NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+    }
+    NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY);
+}
+} // namespace arm_compute
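
Space-to-batch moves spatial blocks into the batch dimension; when paddings enlarge the spatial extent, some output elements never receive input data, which is why run() memsets the output first. A shape-only sketch under an NHWC layout (hypothetical helper, not the arm_compute kernel):

#include <cassert>
#include <cstddef>

struct Shape4D
{
    std::size_t n, h, w, c; // hypothetical NHWC helper
};

Shape4D space_to_batch_shape(Shape4D in, std::size_t block_x, std::size_t block_y,
                             std::size_t pad_left, std::size_t pad_right,
                             std::size_t pad_top, std::size_t pad_bottom)
{
    const std::size_t padded_w = in.w + pad_left + pad_right;
    const std::size_t padded_h = in.h + pad_top + pad_bottom;
    // The padded spatial dimensions must divide evenly by the block sizes.
    assert(padded_w % block_x == 0 && padded_h % block_y == 0);
    // Each spatial block becomes its own batch entry.
    return Shape4D{ in.n * block_x * block_y, padded_h / block_y, padded_w / block_x, in.c };
}

// space_to_batch_shape({1, 4, 4, 3}, 2, 2, 0, 0, 0, 0) -> {4, 2, 2, 3}
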
diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp
index e947657..0373ab6 100644
--- a/src/runtime/NEON/functions/NESplit.cpp
+++ b/src/runtime/NEON/functions/NESplit.cpp
@@ -42,8 +42,8 @@
 void NESplit::configure(const ITensor *input, const std::vector<ITensor *> &outputs, unsigned int axis)
 {
     // Create Slice functions
-    _num_outputs     = outputs.size();
-    _slice_functions = arm_compute::support::cpp14::make_unique<NESlice[]>(_num_outputs);
+    _num_outputs = outputs.size();
+    _slice_functions.resize(_num_outputs);
 
     // Get output shape
     const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs);
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
index 2f49c22..32350b0 100644
--- a/src/runtime/NEON/functions/NEStackLayer.cpp
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -43,8 +43,8 @@
 
 void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITensor *output)
 {
-    _num_inputs    = input.size();
-    _stack_kernels = arm_compute::support::cpp14::make_unique<NEStackLayerKernel[]>(_num_inputs);
+    _num_inputs = input.size();
+    _stack_kernels.resize(_num_inputs);
 
     // Wrap around negative values
     const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp
index 7532020..21f35f8 100644
--- a/src/runtime/NEON/functions/NEUnstack.cpp
+++ b/src/runtime/NEON/functions/NEUnstack.cpp
@@ -74,7 +74,7 @@
     // Wrap around negative values
     const unsigned int axis_u = wrap_axis(axis, input->info());
     _num_slices               = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
-    _strided_slice_vector     = arm_compute::support::cpp14::make_unique<NEStridedSlice[]>(_num_slices);
+    _strided_slice_vector.resize(_num_slices);
 
     Coordinates slice_start;
     int32_t     slice_end_mask;
diff --git a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
index 7e435c3..25b5216 100644
--- a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
@@ -40,14 +40,15 @@
 {
 }
 
-Status NEWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+template <typename TensorInfoType, typename>
+inline Status NEWidthConcatenateLayer::validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
 
     // Output auto inizialitation if not yet initialized
     TensorInfo  tmp_output_info = *output->clone();
-    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
     auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
     unsigned int width_offset = 0;
@@ -60,8 +61,8 @@
 
     return Status{};
 }
-
-void NEWidthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+template <typename TensorType, typename>
+inline void NEWidthConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_vector, ITensor *output)
 {
     _num_inputs = inputs_vector.size();
 
@@ -70,7 +71,7 @@
     {
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
     }
-    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
 
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
@@ -78,7 +79,7 @@
 
     unsigned int width_offset = 0;
 
-    _concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEWidthConcatenateLayerKernel[]>(_num_inputs);
+    _concat_kernels_vector.resize(_num_inputs);
 
     for(unsigned int i = 0; i < _num_inputs; ++i)
     {
@@ -87,10 +88,30 @@
     }
 }
 
+void NEWidthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+{
+    configure_internal(std::move(inputs_vector), output);
+}
+
+void NEWidthConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, ITensor *output)
+{
+    configure_internal(std::move(inputs_vector), output);
+}
+
+Status NEWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+{
+    return validate_internal(inputs_vector, output);
+}
+
+Status NEWidthConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+{
+    return validate_internal(inputs_vector, output);
+}
+
 void NEWidthConcatenateLayer::run()
 {
     for(unsigned i = 0; i < _num_inputs; ++i)
     {
-        NEScheduler::get().schedule(_concat_kernels_vector.get() + i, Window::DimY);
+        NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimY);
     }
 }
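
The width-concatenate changes fold the const and non-const overloads into a single template so the configure/validate logic is written once. A minimal sketch of that forwarding pattern with hypothetical types:

#include <utility>
#include <vector>

struct Tensor
{
};

class WidthConcat
{
public:
    // Public entry points differ only in the const-ness of the inputs...
    void configure(std::vector<Tensor *> inputs, Tensor *output)
    {
        configure_internal(std::move(inputs), output);
    }
    void configure(std::vector<const Tensor *> inputs, Tensor *output)
    {
        configure_internal(std::move(inputs), output);
    }

private:
    // ...and forward to one template holding the shared logic.
    template <typename T>
    void configure_internal(std::vector<T *> &&inputs, Tensor *output)
    {
        (void)inputs;
        (void)output;
        // shared configuration logic, independent of the const-ness of the inputs
    }
};
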
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index e37f8ab..1513786 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "support/ToolchainSupport.h"
 
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp"
 
 namespace arm_compute
 {
@@ -162,7 +162,7 @@
     const int        in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
     const int        in_batches  = input->info()->dimension(3);
 
-    return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
+    return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
 }
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
@@ -234,12 +234,12 @@
 
 } //namespace
 
-NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
     : _memory_group(memory_manager), _gemm_function(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
-      _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(),
-      _is_prepared(false), _is_activationlayer_enabled(false)
+      _permute_input(), _permute_weights(), _permute_output(), _input_transformed(), _output_transformed(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(),
+      _weights_hwio(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false)
 {
-} /* arm_compute */
+}
 
 void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
                                            bool enable_fast_math)
@@ -380,20 +380,17 @@
     // Kernel Storage
     const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels,
                                                                                          in_channels)
-                                       * data_type_size
-                                       + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+                                       * data_type_size;
 
     // Input storage
     const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols,
                                                                                      use_same_padding)
-                                      * data_type_size
-                                      + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+                                      * data_type_size;
 
     // Output storage
     const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels,
                                                                                         use_same_padding)
-                                       * data_type_size
-                                       + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+                                       * data_type_size;
     ;
     const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
     const int         kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
@@ -431,14 +428,16 @@
     d_strides.set(2, 0);
     d_strides.set(3, data_type_size * output_matrix_stride);
 
-    TensorInfo a_info, b_info, d_info;
+    TensorInfo a_info{};
+    TensorInfo b_info{};
+    TensorInfo d_info{};
     a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size);
     b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size);
     d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size);
 
-    _input_workspace.allocator()->init(a_info, storage_alignment);
+    _input_transformed.allocator()->init(a_info, storage_alignment);
     _kernel_storage.allocator()->init(b_info, storage_alignment);
-    _output_workspace.allocator()->init(d_info, storage_alignment);
+    _output_transformed.allocator()->init(d_info, storage_alignment);
 
     // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
     TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
@@ -446,47 +445,58 @@
                     1, _output->info()->data_type());
     _output_nhwc.allocator()->init(info);
 
-    // Configure the InputTransform
-    _memory_group.manage(&_input_workspace);
-    _memory_group.manage(&_output_workspace);
+    const ITensor     *input_to_use  = _input;
+    ITensor           *output_to_use = _output;
+    PermutationVector  weights_permutation_vector(3U, 0U, 1U, 2U);
+    const unsigned int max_num_threads = NEScheduler::get().num_threads();
 
+    // Configure the kernel to transform the input tensor from NCHW -> NHWC
     if(data_layout == DataLayout::NCHW)
     {
-        // configure the kernel to transform the input tensor from NCHW -> NHWC
+        _memory_group.manage(&_input_nhwc);
         _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
-        _input_nhwc.allocator()->allocate();
-        transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
-                                          &_input_workspace, input_matrix_stride);
-
-        // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
-        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
-
-        transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-
-        //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
-        _memory_group.manage(&_output_nhwc);
-        transform_output_kernel->configure(biases, &_output_workspace,
-                                           output_matrix_stride, &_output_nhwc,
-                                           in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
-    }
-    else
-    {
-        transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
-                                          &_input_workspace, input_matrix_stride);
-
-        // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
-        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U));
-
-        transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-
-        transform_output_kernel->configure(biases, &_output_workspace,
-                                           output_matrix_stride, _output,
-                                           in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
+        input_to_use               = &_input_nhwc;
+        weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
     }
 
-    _gemm_function.configure(&_input_workspace, &_kernel_storage, nullptr, &_output_workspace, 1.0f, 0.f);
+    // Configure input transform kernel
+    _memory_group.manage(&_input_transformed);
+    _memory_group.manage(&_input_workspace);
+    transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+                                      &_input_transformed, input_matrix_stride, &_input_workspace);
+    const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
+    TensorInfo   input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type());
+    _input_workspace.allocator()->init(input_workspace_info);
     _input_workspace.allocator()->allocate();
+    if(data_layout == DataLayout::NCHW)
+    {
+        _input_nhwc.allocator()->allocate();
+    }
+
+    // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+    _permute_weights.configure(weights, &_weights_hwio, weights_permutation_vector);
+    transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
+
+    // Configure GEMM function
+    _memory_group.manage(&_output_transformed);
+    _gemm_function.configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f);
+    _input_transformed.allocator()->allocate();
+
+    // Configure output transform function
+    // The biases tensor has not been allocated at this point in time; the output transform will add the biases to the final result in the run() method
+    if(data_layout == DataLayout::NCHW)
+    {
+        _memory_group.manage(&_output_nhwc);
+        output_to_use = &_output_nhwc;
+    }
+    transform_output_kernel->configure(biases, &_output_transformed,
+                                       output_matrix_stride, output_to_use,
+                                       in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels, &_output_workspace);
+    const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
+    TensorInfo   output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type());
+    _output_workspace.allocator()->init(output_workspace_info);
     _output_workspace.allocator()->allocate();
+    _output_transformed.allocator()->allocate();
 
     // Reorder the convoluted output to ACL's ordering NCHW
     if(data_layout == DataLayout::NCHW)
@@ -513,7 +523,7 @@
 
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(data_layout == DataLayout::NCHW)
     {
@@ -526,6 +536,7 @@
 
     //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
     _gemm_function.run();
+
     // Transform output tensor to the spatial domain
     NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
 
@@ -539,8 +550,6 @@
     {
         _activationlayer_function.run();
     }
-
-    _memory_group.release();
 }
 
 Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
new file mode 100644
index 0000000..049bf66
--- /dev/null
+++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+namespace
+{
+std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor      *input,
+                                                                   const ITensor      *weights,
+                                                                   ITensor            *output,
+                                                                   PadStrideInfo       conv_info,
+                                                                   ActivationLayerInfo act_info)
+{
+    const DataType    data_type = input->info()->data_type();
+    const TensorShape shape     = input->info()->tensor_shape();
+
+    const int n_batches      = shape[3];
+    const int in_rows        = shape.z();
+    const int in_cols        = shape.y();
+    const int n_channels     = shape.x();
+    const int padding_top    = conv_info.pad_top();
+    const int padding_left   = conv_info.pad_left();
+    const int padding_bottom = conv_info.pad_bottom();
+    const int padding_right  = conv_info.pad_right();
+
+    const unsigned int stride_x = conv_info.stride().first;
+
+    // Map activation function
+    neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None;
+    if(arm_compute::utils::info_helpers::is_relu(act_info))
+    {
+        activation = neon_convolution_kernels::ActivationFunction::ReLU;
+    }
+    else if(arm_compute::utils::info_helpers::is_relu6(act_info))
+    {
+        activation = neon_convolution_kernels::ActivationFunction::ReLU6;
+    }
+
+    // Create quantized convolver
+    if(data_type == DataType::QASYMM8)
+    {
+        const QuantizationInfo &input_qinfo   = input->info()->quantization_info();
+        const QuantizationInfo &weights_qinfo = weights->info()->quantization_info();
+        const QuantizationInfo &output_qinfo  = output->info()->quantization_info();
+
+        // Check that the quantization offsets are in the range [0, 255]
+        ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
+        ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255);
+        ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
+        const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
+        const qasymm8::QAsymm8Params wqinfo{ static_cast<uint8_t>(weights_qinfo.offset), weights_qinfo.scale };
+        const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };
+
+        // Calculate rescale parameters
+        const float fmultiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+        int         qmultiplier = 0;
+        int         qshift      = 0;
+        quantization::calculate_quantized_multiplier_less_than_one(fmultiplier, &qmultiplier, &qshift);
+        qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultiplier);
+
+        // Create convolver
+        switch(stride_x)
+        {
+            case 1:
+                return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
+                           n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+            case 2:
+                return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
+                           n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+            default:
+                return nullptr;
+        }
+    }
+    else
+    {
+        // Create float convolver
+        switch(data_type)
+        {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+            case DataType::F16:
+            {
+                switch(stride_x)
+                {
+                    case 1:
+                        return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>(
+                                   n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+                    case 2:
+                        return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>(
+                                   n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+                    default:
+                        return nullptr;
+                }
+                break;
+            }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+            case DataType::F32:
+            {
+                switch(stride_x)
+                {
+                    case 1:
+                        return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>(
+                                   n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+                    case 2:
+                        return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>(
+                                   n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+                    default:
+                        return nullptr;
+                }
+                break;
+            }
+            default:
+                return nullptr;
+        }
+    }
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+NEDepthwiseConvolutionAssemblyDispatch::NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<arm_compute::IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _packed_weights(), _workspace(), _is_prepared(false), _dwc_assembly_kernel(nullptr),
+      _dwc_acl_kernel()
+{
+}
+#endif /* DOXYGEN_SKIP_THIS */
+
+void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor             *input,
+                                                       const ITensor             *weights,
+                                                       const ITensor             *bias,
+                                                       ITensor                   *output,
+                                                       const PadStrideInfo       &conv_info,
+                                                       unsigned int               depth_multiplier,
+                                                       const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_UNUSED(depth_multiplier);
+    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionAssemblyDispatch::validate(input->info(),
+                                                                                weights->info(),
+                                                                                bias != nullptr ? bias->info() : nullptr,
+                                                                                output->info(),
+                                                                                conv_info,
+                                                                                depth_multiplier,
+                                                                                act_info));
+
+    // Auto-initialize the output if not yet initialized
+    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+
+    _input       = input;
+    _weights     = weights;
+    _bias        = bias;
+    _output      = output;
+    _is_prepared = false;
+
+    // Create convolver
+    _dwc_assembly_kernel = create_convolver(input, weights, output, conv_info, act_info);
+    ARM_COMPUTE_ERROR_ON(_dwc_assembly_kernel == nullptr);
+
+    // Create assembly kernel wrapper
+    _dwc_acl_kernel.configure(_dwc_assembly_kernel.get());
+
+    constexpr size_t alignment = 128;
+
+    // Create workspace
+    const unsigned int num_threads    = NEScheduler::get().num_threads();
+    const size_t       workspace_size = _dwc_assembly_kernel->get_working_space_size(num_threads);
+    ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !");
+    _workspace.allocator()->init(TensorInfo(TensorShape{ workspace_size }, 1, DataType::S8), alignment);
+    _memory_group.manage(&_workspace);
+    _workspace.allocator()->allocate();
+
+    // Create packing tensor
+    const size_t pack_tensor_size = _dwc_assembly_kernel->get_packed_params_size();
+    ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !");
+    _packed_weights.allocator()->init(TensorInfo(TensorShape{ pack_tensor_size }, 1, DataType::S8), alignment);
+}
+
+Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo         *input,
+                                                        const ITensorInfo         *weights,
+                                                        const ITensorInfo         *bias,
+                                                        const ITensorInfo         *output,
+                                                        const PadStrideInfo       &conv_info,
+                                                        unsigned int               depth_multiplier,
+                                                        const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+    const auto       strides     = conv_info.stride();
+    const DataLayout data_layout = input->data_layout();
+    unsigned int     width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    unsigned int     height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != 3 || weights->dimension(height_idx) != 3);
+    ARM_COMPUTE_RETURN_ERROR_ON(!((strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2))));
+    ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier != 1);
+
+    const bool is_relu  = arm_compute::utils::info_helpers::is_relu(act_info);
+    const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !(is_relu || is_relu6));
+
+    // Check bias
+    if(bias != nullptr)
+    {
+        unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx));
+    }
+
+    // Check output
+    if(output->total_size() != 0)
+    {
+        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input,
+                                                                    const ITensorInfo *weights,
+                                                                    PadStrideInfo      conv_info,
+                                                                    unsigned int       depth_multiplier,
+                                                                    const Size2D      &dilation)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+
+    // Re-order the input shape dimensions if in NHWC format
+    const DataLayout data_layout = input->data_layout();
+    TensorShape      in_shape{ input->tensor_shape() };
+    if(data_layout == DataLayout::NHWC)
+    {
+        in_shape.set(Window::DimX, input->tensor_shape().y());
+        in_shape.set(Window::DimY, input->tensor_shape().z());
+        in_shape.set(Window::DimZ, input->tensor_shape().x());
+    }
+
+    // Check data type
+    const DataType data_type          = weights->data_type();
+    bool           is_data_type_valid = is_data_type_float(data_type) || is_data_type_quantized_asymmetric(data_type);
+
+    // Check weights size
+    const unsigned int width_idx         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx        = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    bool               weights_supported = (weights->dimension(width_idx) == 3) && (weights->dimension(height_idx) == 3);
+
+    // Check for supported strides
+    const auto &strides           = conv_info.stride();
+    bool        supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));
+
+    // Check for supported padding
+    const auto    pad_top           = conv_info.pad_top();
+    const auto    pad_right         = conv_info.pad_right();
+    const auto    pad_bottom        = conv_info.pad_bottom();
+    const auto    pad_left          = conv_info.pad_left();
+    PadStrideInfo same_pad          = calculate_same_pad(in_shape, TensorShape(3U, 3U), conv_info);
+    bool          is_same_padding   = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
+    bool          is_valid_padding  = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
+    bool          supported_padding = is_same_padding || is_valid_padding;
+    bool          is_dilation_1     = dilation.x() == 1 && dilation.y() == 1;
+
+    return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_1;
+}
+
+void NEDepthwiseConvolutionAssemblyDispatch::run()
+{
+    // Prepare assembly kernel
+    prepare();
+
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Setup inputs/outputs
+    ARM_COMPUTE_ERROR_ON(_workspace.buffer() == nullptr);
+    _dwc_assembly_kernel->set_working_space(static_cast<void *>(_workspace.buffer()));
+
+    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+    const int   input_element_size = _input->info()->element_size();
+    const int   input_batch_stride = _input->info()->strides_in_bytes()[3] / input_element_size;
+    const int   input_row_stride   = _input->info()->strides_in_bytes().z() / input_element_size;
+    const int   input_col_stride   = _input->info()->strides_in_bytes().y() / input_element_size;
+    const void *input_ptr          = _input->buffer() + _input->info()->offset_first_element_in_bytes();
+    _dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
+
+    ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+    const int output_element_size = _output->info()->element_size();
+    const int output_batch_stride = _output->info()->strides_in_bytes()[3] / output_element_size;
+    const int output_row_stride   = _output->info()->strides_in_bytes().z() / output_element_size;
+    const int output_col_stride   = _output->info()->strides_in_bytes().y() / output_element_size;
+    void     *output_ptr          = _output->buffer() + _output->info()->offset_first_element_in_bytes();
+    _dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
+
+    // Schedule assembly kernel
+    NEScheduler::get().schedule(&_dwc_acl_kernel, Window::DimX);
+}
+
+void NEDepthwiseConvolutionAssemblyDispatch::prepare()
+{
+    if(!_is_prepared)
+    {
+        _packed_weights.allocator()->allocate();
+        ARM_COMPUTE_ERROR_ON(_packed_weights.buffer() == nullptr);
+
+        // Pack weights and bias
+        const int weights_element_size = _weights->info()->element_size();
+        const int weights_row_stride   = _weights->info()->strides_in_bytes().z() / weights_element_size;
+        const int weights_col_stride   = _weights->info()->strides_in_bytes().y() / weights_element_size;
+        _dwc_assembly_kernel->pack_params(_packed_weights.buffer(),
+                                          _weights->buffer() + _weights->info()->offset_first_element_in_bytes(),
+                                          weights_row_stride,
+                                          weights_col_stride,
+                                          (_bias != nullptr) ? _bias->buffer() : nullptr);
+        _dwc_assembly_kernel->set_packed_params_buffer(_packed_weights.buffer());
+
+        _weights->mark_as_unused();
+        if(_bias != nullptr)
+        {
+            _bias->mark_as_unused();
+        }
+        _is_prepared = true;
+    }
+}
+} // namespace arm_compute
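The QASYMM8 branch of create_convolver in the new file above folds the input, weights and output scales into a single real multiplier and hands the assembly convolver a fixed-point (qmultiplier, qshift) pair. The standalone sketch below illustrates the usual encoding of such a multiplier in (0, 1) as a Q0.31 integer plus a right shift; it is an illustration of the idea only, not the library's calculate_quantized_multiplier_less_than_one, and the sign convention of the shift may differ from the library's.

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // Illustration only: encode a real multiplier M in (0, 1) as a Q0.31 integer
    // plus a right shift, so that M ~= quantized_multiplier * 2^-(31 + right_shift).
    void quantize_multiplier_less_than_one(double multiplier, int32_t *quantized_multiplier, int *right_shift)
    {
        assert(multiplier > 0.0 && multiplier < 1.0);
        int          exponent = 0;
        const double q        = std::frexp(multiplier, &exponent); // multiplier = q * 2^exponent, q in [0.5, 1)
        auto         q_fixed  = static_cast<int64_t>(std::round(q * (1ll << 31)));
        if(q_fixed == (1ll << 31)) // rounding may push q up to 1.0
        {
            q_fixed /= 2;
            ++exponent;
        }
        *quantized_multiplier = static_cast<int32_t>(q_fixed);
        *right_shift          = -exponent; // for M < 1 this is >= 0, except in the rounding corner case handled above
    }

At apply time the convolver multiplies the accumulator by the integer multiplier and shifts right, which is why only the pair (qshift, qmultiplier) needs to travel into the rescale parameters.
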
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
index 34aaea0..e207ab0 100644
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -183,9 +183,8 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
     NEScheduler::get().run_tagged_workloads(_workloads, _tag.c_str());
-    _memory_group.release();
 }
 
 void NEGEMMInterleavedWrapper::prepare()
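The hunk above swaps an explicit acquire()/release() pair for an RAII guard. A minimal sketch of that pattern, assuming only that MemoryGroupResourceScope is available from arm_compute/runtime/IMemoryGroup.h as it is used here (hypothetical namespace and function):

    #include "arm_compute/runtime/IMemoryGroup.h"

    namespace example // hypothetical namespace for the sketch
    {
    void run_workloads(arm_compute::IMemoryGroup &memory_group)
    {
        // Backing memory is acquired when scope_mg is constructed and released
        // when it goes out of scope, including on early return or exception,
        // so a release() call can never be skipped.
        arm_compute::MemoryGroupResourceScope scope_mg(memory_group);

        // ... schedule workloads that read/write tensors managed by memory_group ...
    }
    } // namespace example
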
diff --git a/src/runtime/Pyramid.cpp b/src/runtime/Pyramid.cpp
index ebd6570..bc7b550 100644
--- a/src/runtime/Pyramid.cpp
+++ b/src/runtime/Pyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,8 +45,8 @@
 
 void Pyramid::internal_init(const PyramidInfo &info, bool auto_padding)
 {
-    _info    = info;
-    _pyramid = arm_compute::support::cpp14::make_unique<Tensor[]>(_info.num_levels());
+    _info = info;
+    _pyramid.resize(_info.num_levels());
 
     size_t      w            = _info.width();
     size_t      h            = _info.height();
@@ -56,11 +56,11 @@
     TensorShape tensor_shape = _info.tensor_shape();
 
     // Note: Look-up table used by the OpenVX sample implementation
-    const float c_orbscale[4] = { 0.5f,
-                                  SCALE_PYRAMID_ORB,
-                                  SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB,
-                                  SCALE_PYRAMID_ORB *SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB
-                                };
+    const std::array<float, 4> c_orbscale = { 0.5f,
+                                              SCALE_PYRAMID_ORB,
+                                              SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB,
+                                              SCALE_PYRAMID_ORB *SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB
+                                            };
 
     for(size_t i = 0; i < _info.num_levels(); ++i)
     {
@@ -71,7 +71,7 @@
             tensor_info.auto_padding();
         }
 
-        (_pyramid.get() + i)->allocator()->init(tensor_info);
+        _pyramid[i].allocator()->init(tensor_info);
 
         if(is_orb_scale)
         {
@@ -99,11 +99,9 @@
 
 void Pyramid::allocate()
 {
-    ARM_COMPUTE_ERROR_ON(_pyramid == nullptr);
-
     for(size_t i = 0; i < _info.num_levels(); ++i)
     {
-        (_pyramid.get() + i)->allocator()->allocate();
+        _pyramid[i].allocator()->allocate();
     }
 }
 
@@ -116,5 +114,5 @@
 {
     ARM_COMPUTE_ERROR_ON(index >= _info.num_levels());
 
-    return (_pyramid.get() + index);
-}
+    return &_pyramid[index];
+}
\ No newline at end of file
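The Pyramid change above replaces a raw make_unique<Tensor[]> buffer with std::vector<Tensor>, so the levels are value-owned, no null check is needed before allocation, and element access is plain indexing. A minimal sketch of the same container pattern (hypothetical helper, not library code):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"

    #include <vector>

    // Hypothetical helper: hold pyramid-like levels by value in a std::vector,
    // initialise each allocator by plain indexing, and let the vector manage
    // lifetime instead of unique_ptr<Tensor[]> plus pointer arithmetic.
    std::vector<arm_compute::Tensor> make_levels(const arm_compute::TensorInfo &level_info, size_t num_levels)
    {
        std::vector<arm_compute::Tensor> levels(num_levels);
        for(size_t i = 0; i < levels.size(); ++i)
        {
            levels[i].allocator()->init(level_info);
        }
        return levels;
    }
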
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 38edb8b..0612d75 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -149,11 +149,11 @@
     info().set_is_resizable(true);
 }
 
-arm_compute::Status TensorAllocator::import_memory(void *memory, size_t size)
+Status TensorAllocator::import_memory(void *memory)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(memory == nullptr);
-    ARM_COMPUTE_RETURN_ERROR_ON(size == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(alignment() != 0 && !arm_compute::utility::check_aligned(memory, alignment()));
 
     _memory.set_owned_region(support::cpp14::make_unique<MemoryRegion>(memory, info().total_size()));
     info().set_is_resizable(false);
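
With the new signature, import_memory takes only the pointer: the size comes from the tensor's own info, the pointer must satisfy the allocator's alignment, and ownership stays with the caller. A minimal usage sketch under those assumptions (hypothetical shape and a float backing buffer):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"

    #include <vector>

    using namespace arm_compute;

    void import_caller_owned_buffer()
    {
        // Describe the tensor first; total_size() of this info is what the
        // allocator maps onto the imported pointer.
        Tensor tensor;
        tensor.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

        // Caller-owned storage, sized to match the tensor; it must outlive any
        // use of the tensor, since ownership is not transferred.
        std::vector<float> backing(16 * 16);

        const Status status = tensor.allocator()->import_memory(backing.data());
        ARM_COMPUTE_ERROR_THROW_ON(status);

        // ... run NEON functions on tensor here ...
    }
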