arm_compute v17.09

Change-Id: I4bf8f4e6e5f84ce0d5b6f5ba570d276879f42a81
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
new file mode 100644
index 0000000..50b0f0e
--- /dev/null
+++ b/src/runtime/Allocator.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Allocator.h"
+
+#include "arm_compute/core/Error.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+void *Allocator::allocate(size_t size, size_t alignment)
+{
+    ARM_COMPUTE_UNUSED(alignment); // the requested alignment is not honoured: plain ::operator new is used below
+    return ::operator new(size);   // raw storage of size bytes; pairs with the ::operator delete in Allocator::free()
+}
+
+void Allocator::free(void *ptr)
+{
+    ::operator delete(ptr); // counterpart of the ::operator new call in Allocator::allocate()
+}
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
new file mode 100644
index 0000000..69292b9
--- /dev/null
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/BlobLifetimeManager.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/BlobMemoryPool.h"
+#include "arm_compute/runtime/IAllocator.h"
+#include "arm_compute/runtime/IMemoryGroup.h"
+#include "support/ToolchainSupport.h"
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <vector>
+
+using namespace arm_compute;
+
+BlobLifetimeManager::BlobLifetimeManager()
+    : _active_group(nullptr), _active_elements(), _finalized_groups(), _blobs() // no active group; every other member is value-initialised
+{
+}
+
+void BlobLifetimeManager::register_group(IMemoryGroup *group)
+{
+    if(_active_group == nullptr) // a group that is already active is kept: the call is then a no-op
+    {
+        ARM_COMPUTE_ERROR_ON(group == nullptr);
+        _active_group = group; // stays the active group until end_lifetime() resets it to nullptr
+    }
+}
+
+void BlobLifetimeManager::start_lifetime(void *obj)
+{
+    ARM_COMPUTE_ERROR_ON(obj == nullptr);
+    ARM_COMPUTE_ERROR_ON_MSG(std::any_of(std::begin(_active_elements), std::end(_active_elements), [obj](const Element & element)
+    {
+        return element.id == obj;
+    }),
+    "Memory object is already registered!");
+
+    // Track the object as an active element; its handle and size are filled in later by end_lifetime()
+    _active_elements.emplace_back(obj);
+}
+
+void BlobLifetimeManager::end_lifetime(void *obj, void **handle, size_t size)
+{
+    ARM_COMPUTE_ERROR_ON(obj == nullptr);
+    // Look up the element that start_lifetime() registered for this object
+    const auto matches_obj = [obj](const Element & element)
+    {
+        return element.id == obj;
+    };
+    auto element_it = std::find_if(std::begin(_active_elements), std::end(_active_elements), matches_obj);
+    ARM_COMPUTE_ERROR_ON(element_it == std::end(_active_elements));
+
+    // Store the handle the memory will be written to and the required size, and flag the element as finalized
+    element_it->status = true;
+    element_it->size   = size;
+    element_it->handle = handle;
+
+    // Wait for the remaining elements of the group before touching the blobs
+    if(!are_all_finalized())
+    {
+        return;
+    }
+    // Archive the group's elements, then refresh the blob sizes and the group's mappings
+    auto &finalized_elements = _finalized_groups[_active_group];
+    finalized_elements.insert(std::end(finalized_elements), std::begin(_active_elements), std::end(_active_elements));
+    update_blobs_and_mappings();
+
+    // Reset the state so that a new group can be registered
+    _active_elements.clear();
+    _active_group = nullptr;
+}
+
+std::unique_ptr<IMemoryPool> BlobLifetimeManager::create_pool(IAllocator *allocator)
+{
+    ARM_COMPUTE_ERROR_ON(allocator == nullptr);
+    return support::cpp14::make_unique<BlobMemoryPool>(allocator, _blobs); // the pool receives a copy of the current blob sizes
+}
+
+bool BlobLifetimeManager::are_all_finalized() const
+{
+    return std::all_of(std::begin(_active_elements), std::end(_active_elements), [](const Element & e) // by reference: no per-element copy
+    {
+        return e.status; // set to true by end_lifetime(); the result is true when there is no active element
+    });
+}
+
+MappingType BlobLifetimeManager::mapping_type() const
+{
+    return MappingType::BLOBS; // the same value BlobMemoryPool::mapping_type() reports
+}
+
+void BlobLifetimeManager::update_blobs_and_mappings()
+{
+    ARM_COMPUTE_ERROR_ON(!are_all_finalized());
+    ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
+
+    // Sort active group requirements in descending order, so that the i-th largest
+    // element of every finalized group ends up sharing the i-th blob
+    std::sort(std::begin(_active_elements), std::end(_active_elements), [](const Element & lhs, const Element & rhs)
+    {
+        return lhs.size > rhs.size;
+    });
+
+    // Make sure there is one blob per element. Blobs created here start with size 0;
+    // blobs that already exist keep the size required by previously finalized groups
+    const size_t num_elements = _active_elements.size();
+    if(_blobs.size() < num_elements)
+    {
+        _blobs.resize(num_elements, 0);
+    }
+
+    // Walk the sorted elements once: the position of an element is the index of its blob.
+    // - Grow the blob if this element needs more memory than any element mapped to it so far
+    // - Record the handle -> blob index association in the mappings of the active group
+    // The index is a size_t, like the container sizes above, so no signed/unsigned mix is involved
+    auto  &group_mappings = _active_group->mappings();
+    size_t blob_idx       = 0;
+    for(const auto &element : _active_elements)
+    {
+        _blobs[blob_idx]               = std::max<size_t>(_blobs[blob_idx], element.size);
+        group_mappings[element.handle] = blob_idx;
+        ++blob_idx;
+    }
+}
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
new file mode 100644
index 0000000..29505e5
--- /dev/null
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/BlobMemoryPool.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IAllocator.h"
+#include "arm_compute/runtime/IMemoryPool.h"
+#include "arm_compute/runtime/Types.h"
+#include "support/ToolchainSupport.h"
+
+#include <vector>
+
+using namespace arm_compute;
+
+BlobMemoryPool::BlobMemoryPool(IAllocator *allocator, std::vector<size_t> blob_sizes)
+    : _allocator(allocator), _blobs(), _blob_sizes(std::move(blob_sizes)) // blob_sizes is taken by value and moved into the member
+{
+    ARM_COMPUTE_ERROR_ON(!allocator);
+    allocate_blobs(_blob_sizes); // one blob per stored size is requested from the allocator at construction
+}
+
+BlobMemoryPool::~BlobMemoryPool()
+{
+    ARM_COMPUTE_ERROR_ON(!_allocator);
+    free_blobs(); // hands every blob back to _allocator and empties _blobs
+}
+
+void BlobMemoryPool::acquire(MemoryMappings &handles)
+{
+    // Each mapping pairs a handle (first) with the index of the blob that backs it (second)
+    for(auto &mapping : handles)
+    {
+        ARM_COMPUTE_ERROR_ON(mapping.first == nullptr);
+        *mapping.first = _blobs[mapping.second];
+    }
+}
+
+void BlobMemoryPool::release(MemoryMappings &handles)
+{
+    for(auto &mapping : handles)
+    {
+        ARM_COMPUTE_ERROR_ON(mapping.first == nullptr);
+        *mapping.first = nullptr; // detach the handle from its blob; the blob itself is not freed here
+    }
+}
+
+MappingType BlobMemoryPool::mapping_type() const
+{
+    return MappingType::BLOBS; // the same value BlobLifetimeManager::mapping_type() reports
+}
+
+std::unique_ptr<IMemoryPool> BlobMemoryPool::duplicate()
+{
+    ARM_COMPUTE_ERROR_ON(!_allocator);
+    return support::cpp14::make_unique<BlobMemoryPool>(_allocator, _blob_sizes); // same allocator and sizes; the new pool allocates its own blobs
+}
+
+void BlobMemoryPool::allocate_blobs(const std::vector<size_t> &sizes)
+{
+    ARM_COMPUTE_ERROR_ON(!_allocator);
+    _blobs.reserve(_blobs.size() + sizes.size()); // grow once up front: the push_back calls below never reallocate
+    for(const auto &size : sizes)
+    {
+        _blobs.push_back(_allocator->allocate(size, 0)); // blobs are stored in the order of their sizes
+    }
+}
+
+void BlobMemoryPool::free_blobs()
+{
+    ARM_COMPUTE_ERROR_ON(!_allocator);
+    // Hand every blob back to the allocator, then drop the stale pointers
+    for(void *blob : _blobs)
+    {
+        _allocator->free(blob);
+    }
+    _blobs.clear();
+}
\ No newline at end of file
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
new file mode 100644
index 0000000..9a5c13a
--- /dev/null
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLBufferAllocator.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+CLBufferAllocator::CLBufferAllocator(cl::Context context)
+    : _context(std::move(context)) // context is taken by value and moved into the member
+{
+}
+
+void *CLBufferAllocator::allocate(size_t size, size_t alignment)
+{
+    ARM_COMPUTE_UNUSED(alignment); // the alignment request is not forwarded to OpenCL
+    cl_mem buf = clCreateBuffer(_context.get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr); // NOTE(review): errcode_ret is nullptr, so a failure can only show up as a null handle - confirm callers cope
+    return static_cast<void *>(buf); // the cl_mem handle itself is returned as the opaque pointer
+}
+
+void CLBufferAllocator::free(void *ptr)
+{
+    ARM_COMPUTE_ERROR_ON(ptr == nullptr);
+    clReleaseMemObject(static_cast<cl_mem>(ptr)); // ptr is treated as the cl_mem returned by allocate(); the return code is not checked
+}
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
index b9e8739..88d45ac 100644
--- a/src/runtime/CL/CLMultiHOG.cpp
+++ b/src/runtime/CL/CLMultiHOG.cpp
@@ -25,12 +25,12 @@
 
 #include "arm_compute/core/CL/ICLHOG.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
 CLMultiHOG::CLMultiHOG(size_t num_models)
-    : _num_models(num_models), _model(arm_compute::cpp14::make_unique<CLHOG[]>(_num_models))
+    : _num_models(num_models), _model(arm_compute::support::cpp14::make_unique<CLHOG[]>(_num_models))
 {
 }
 
diff --git a/src/runtime/CL/CLPyramid.cpp b/src/runtime/CL/CLPyramid.cpp
index 41d81ea..865f389 100644
--- a/src/runtime/CL/CLPyramid.cpp
+++ b/src/runtime/CL/CLPyramid.cpp
@@ -24,10 +24,10 @@
 #include "arm_compute/runtime/CL/CLPyramid.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PyramidInfo.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
+#include "support/ToolchainSupport.h"
 
 #include <array>
 #include <cmath>
@@ -52,7 +52,7 @@
 void CLPyramid::internal_init(const PyramidInfo &info, bool auto_padding)
 {
     _info    = info;
-    _pyramid = arm_compute::cpp14::make_unique<CLTensor[]>(_info.num_levels());
+    _pyramid = arm_compute::support::cpp14::make_unique<CLTensor[]>(_info.num_levels());
 
     size_t      w            = _info.width();
     size_t      h            = _info.height();
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index fe25ce5..71a749f 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -24,11 +24,12 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 #include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLTuner.h"
 
 using namespace arm_compute;
 
 CLScheduler::CLScheduler()
-    : _context(), _queue(), _target(GPUTarget::MIDGARD)
+    : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
 {
 }
 
@@ -40,10 +41,22 @@
 
 void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
 {
+    ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
+                             "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), "
+                             "or CLScheduler::get().init() and CLKernelLibrary::get().init() function before running functions!");
+
+    // Tune the kernel if the CLTuner has been provided
+    if(_cl_tuner != nullptr)
+    {
+        // Let the tuner set the kernel's Local-Workgroup-Size hint before the kernel is run
+        _cl_tuner->tune_kernel(kernel);
+    }
+
+    // Run kernel
     kernel.run(kernel.window(), _queue);
 
     if(flush)
     {
         _queue.flush();
     }
-}
+}
\ No newline at end of file
diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp
index eefa033..bc513d1 100644
--- a/src/runtime/CL/CLTensor.cpp
+++ b/src/runtime/CL/CLTensor.cpp
@@ -28,7 +28,7 @@
 using namespace arm_compute;
 
 CLTensor::CLTensor()
-    : _allocator()
+    : _allocator(this)
 {
 }
 
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index 8112a71..ad165fa 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -25,15 +25,21 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 using namespace arm_compute;
 
-CLTensorAllocator::CLTensorAllocator()
-    : _buffer(), _mapping(nullptr)
+CLTensorAllocator::CLTensorAllocator(CLTensor *owner)
+    : _associated_memory_group(nullptr), _buffer(), _mapping(nullptr), _owner(owner)
 {
 }
 
+CLTensorAllocator::~CLTensorAllocator()
+{
+    _buffer = cl::Buffer(); // replace the buffer with a default-constructed one, as free() does for tensors without a memory group
+}
+
 uint8_t *CLTensorAllocator::data()
 {
     return _mapping;
@@ -47,17 +53,32 @@
 void CLTensorAllocator::allocate()
 {
     ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
-
-    _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size());
+    if(_associated_memory_group == nullptr)
+    {
+        _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size());
+    }
+    else
+    {
+        _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_buffer()), info().total_size());
+    }
     info().set_is_resizable(false);
 }
 
 void CLTensorAllocator::free()
 {
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+    if(_associated_memory_group == nullptr)
+    {
+        _buffer = cl::Buffer();
+        info().set_is_resizable(true);
+    }
+}
 
-    _buffer = cl::Buffer();
-    info().set_is_resizable(true);
+void CLTensorAllocator::set_associated_memory_group(CLMemoryGroup *associated_memory_group)
+{
+    ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
+    ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
+    ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+    _associated_memory_group = associated_memory_group;
 }
 
 uint8_t *CLTensorAllocator::lock()
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
new file mode 100644
index 0000000..7f5be86
--- /dev/null
+++ b/src/runtime/CL/CLTuner.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLTuner.h"
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <chrono>
+#include <limits>
+#include <string>
+
+using namespace arm_compute;
+
+CLTuner::CLTuner()
+    : _lws_table() // no Local-Workgroup-Size is known until tune_kernel() or import_lws_table() fills the table
+{
+}
+
+void CLTuner::tune_kernel(ICLKernel &kernel)
+{
+    // The configuration ID is the key under which a tuned Local-Workgroup-Size is stored
+    const std::string &config_id = kernel.config_id();
+
+    // A kernel reporting the default configuration ID does not require tuning
+    if(config_id == arm_compute::default_config_id)
+    {
+        return;
+    }
+
+    const auto entry = _lws_table.find(config_id);
+    if(entry != _lws_table.end())
+    {
+        // Known configuration: apply the Local-Workgroup-Size found earlier
+        kernel.set_lws_hint(entry->second);
+        return;
+    }
+
+    // Unknown configuration: find the optimal LWS for the kernel (the search runs the kernel)
+    const cl::NDRange opt_lws = find_optimal_lws(kernel);
+
+    // Insert the optimal LWS in the table so that the search is not repeated for this configuration
+    _lws_table.emplace(config_id, opt_lws);
+
+    // Set Local-Workgroup-Size
+    kernel.set_lws_hint(opt_lws);
+}
+
+cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
+{
+    cl::CommandQueue q = CLScheduler::get().queue();
+
+    // Drain the queue first, otherwise work enqueued before the search would be
+    // waited for by the first sync() below and charged to the first candidate
+    CLScheduler::get().sync();
+
+    double      min_exec_time = std::numeric_limits<double>::max();
+    cl::NDRange opt_lws       = cl::NDRange(1, 1);
+
+    // Exhaustive search over the 2D Local-Workgroup-Sizes from 1x1 up to 16x16
+    for(int y = 1; y <= 16; ++y)
+    {
+        for(int x = 1; x <= 16; ++x)
+        {
+            const cl::NDRange lws_test = cl::NDRange(x, y);
+
+            //Set the Local-Workgroup-Size
+            kernel.set_lws_hint(lws_test);
+
+            // Time one run of the kernel; the measurement relies on sync() to wait for its completion
+            const auto t_start = std::chrono::high_resolution_clock::now();
+            kernel.run(kernel.window(), q);
+            CLScheduler::get().sync();
+            const auto t_stop = std::chrono::high_resolution_clock::now();
+            const std::chrono::duration<double, std::nano> fp_nano = t_stop - t_start;
+
+            // Keep the candidate only if it is strictly faster than the best one so far
+            if(fp_nano.count() < min_exec_time)
+            {
+                min_exec_time = fp_nano.count();
+                opt_lws       = lws_test;
+            }
+        }
+    }
+
+    return opt_lws;
+}
+
+void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table)
+{
+    // Copy-assignment already replaces the old entries; a clear() first would wipe lws_table too when it aliases _lws_table
+    _lws_table = lws_table;
+}
+
+const std::unordered_map<std::string, cl::NDRange> &CLTuner::export_lws_table()
+{
+    return _lws_table; // a reference to the tuner's own table, not a copy
+}
\ No newline at end of file
diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp
index aa45743..a1a56fd 100644
--- a/src/runtime/CL/ICLSimpleFunction.cpp
+++ b/src/runtime/CL/ICLSimpleFunction.cpp
@@ -28,8 +28,9 @@
 
 using namespace arm_compute;
 
-ICLSimpleFunction::ICLSimpleFunction()
-    : _kernel(), _border_handler()
+ICLSimpleFunction::ICLSimpleFunction() // NOLINT
+    : _kernel(),
+      _border_handler()
 {
 }
 
diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
index 5097dd4..5613e6c 100644
--- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp
+++ b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
 
 #include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLAbsoluteDifferenceKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLAbsoluteDifferenceKernel>();
     k->configure(input1, input2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp
index 56c5199..78f25fc 100644
--- a/src/runtime/CL/functions/CLAccumulate.cpp
+++ b/src/runtime/CL/functions/CLAccumulate.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLAccumulate.h"
 
 #include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,21 +32,21 @@
 
 void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum)
 {
-    auto k = arm_compute::cpp14::make_unique<CLAccumulateKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLAccumulateKernel>();
     k->configure(input, accum);
     _kernel = std::move(k);
 }
 
 void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
 {
-    auto k = arm_compute::cpp14::make_unique<CLAccumulateWeightedKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLAccumulateWeightedKernel>();
     k->configure(input, alpha, accum);
     _kernel = std::move(k);
 }
 
 void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
 {
-    auto k = arm_compute::cpp14::make_unique<CLAccumulateSquaredKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLAccumulateSquaredKernel>();
     k->configure(input, shift, accum);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 9b5bd8b..b64739a 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -24,13 +24,13 @@
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 
 #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-void CLActivationLayer::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
 {
-    auto k = arm_compute::cpp14::make_unique<CLActivationLayerKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>();
     k->configure(input, output, act_info);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp
index 36bff42..5ca384d 100644
--- a/src/runtime/CL/functions/CLArithmeticAddition.cpp
+++ b/src/runtime/CL/functions/CLArithmeticAddition.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
 
 #include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLArithmeticAddition::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
 {
-    auto k = arm_compute::cpp14::make_unique<CLArithmeticAdditionKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticAdditionKernel>();
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
index 97f0a1c..651f51a 100644
--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+++ b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
 
 #include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
 {
-    auto k = arm_compute::cpp14::make_unique<CLArithmeticSubtractionKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionKernel>();
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index 3df673c..68cdaac 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -37,7 +37,7 @@
 {
 }
 
-void CLBatchNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon)
+void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon)
 {
     _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
 }
diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp
index 7c85043..f8a5a85 100644
--- a/src/runtime/CL/functions/CLBitwiseAnd.cpp
+++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
 
 #include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLBitwiseAndKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLBitwiseAndKernel>();
     k->configure(input1, input2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp
index 17ae5de..dc002e5 100644
--- a/src/runtime/CL/functions/CLBitwiseNot.cpp
+++ b/src/runtime/CL/functions/CLBitwiseNot.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
 
 #include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLBitwiseNotKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLBitwiseNotKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp
index c84a279..4a10bb2 100644
--- a/src/runtime/CL/functions/CLBitwiseOr.cpp
+++ b/src/runtime/CL/functions/CLBitwiseOr.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
 
 #include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLBitwiseOrKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLBitwiseOrKernel>();
     k->configure(input1, input2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp
index fd49c7d..d23622a 100644
--- a/src/runtime/CL/functions/CLBitwiseXor.cpp
+++ b/src/runtime/CL/functions/CLBitwiseXor.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
 
 #include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLBitwiseXorKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLBitwiseXorKernel>();
     k->configure(input1, input2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp
index 8de6807..f28be44 100644
--- a/src/runtime/CL/functions/CLBox3x3.cpp
+++ b/src/runtime/CL/functions/CLBox3x3.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLBox3x3.h"
 
 #include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,8 +33,8 @@
 
 void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<CLBox3x3Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLBox3x3Kernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 1d018b8..5acb8e7 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -26,17 +26,31 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
 #include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
 #include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-CLCannyEdge::CLCannyEdge()
-    : _sobel(nullptr), _gradient(), _border_mag_gradient(), _non_max_suppr(), _edge_trace(), _gx(), _gy(), _mag(), _phase(), _nonmax(), _visited(), _recorded(), _l1_list_counter(), _l1_stack()
+CLCannyEdge::CLCannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _sobel(),
+      _gradient(),
+      _border_mag_gradient(),
+      _non_max_suppr(),
+      _edge_trace(),
+      _gx(),
+      _gy(),
+      _mag(),
+      _phase(),
+      _nonmax(),
+      _visited(),
+      _recorded(),
+      _l1_list_counter(),
+      _l1_stack()
 {
 }
 
@@ -83,22 +97,26 @@
     TensorInfo info_s32(shape_l1_stack, 1, arm_compute::DataType::S32);
     _l1_stack.allocator()->init(info_s32);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_gx);
+    _memory_group.manage(&_gy);
+
     // Configure/Init sobelNxN
     if(gradient_size == 3)
     {
-        auto k = arm_compute::cpp14::make_unique<CLSobel3x3>();
+        auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>();
         k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
         _sobel = std::move(k);
     }
     else if(gradient_size == 5)
     {
-        auto k = arm_compute::cpp14::make_unique<CLSobel5x5>();
+        auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>();
         k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
         _sobel = std::move(k);
     }
     else if(gradient_size == 7)
     {
-        auto k = arm_compute::cpp14::make_unique<CLSobel7x7>();
+        auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>();
         k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
         _sobel = std::move(k);
     }
@@ -107,23 +125,43 @@
         ARM_COMPUTE_ERROR("Gradient %d size not supported", gradient_size);
     }
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_mag);
+    _memory_group.manage(&_phase);
+
     // Configure gradient
     _gradient.configure(&_gx, &_gy, &_mag, &_phase, norm_type);
 
+    // Allocate intermediate buffers
+    _gx.allocator()->allocate();
+    _gy.allocator()->allocate();
+
+    // Manage intermediate buffers
+    _memory_group.manage(&_nonmax);
+
     // Configure non-maxima suppression
     _non_max_suppr.configure(&_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED);
 
+    // Allocate intermediate buffers
+    _phase.allocator()->allocate();
+
     // Fill border around magnitude image as non-maxima suppression will access
     // it. If border mode is undefined filling the border is a nop.
     _border_mag_gradient.configure(&_mag, _non_max_suppr.border_size(), border_mode, constant_border_value);
 
+    // Allocate intermediate buffers
+    _mag.allocator()->allocate();
+
+    // Manage intermediate buffers
+    _memory_group.manage(&_visited);
+    _memory_group.manage(&_recorded);
+    _memory_group.manage(&_l1_stack);
+    _memory_group.manage(&_l1_list_counter);
+
     // Configure edge tracing
     _edge_trace.configure(&_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter);
 
-    _gx.allocator()->allocate();
-    _gy.allocator()->allocate();
-    _phase.allocator()->allocate();
-    _mag.allocator()->allocate();
+    // Allocate intermediate buffers
     _visited.allocator()->allocate();
     _recorded.allocator()->allocate();
     _l1_stack.allocator()->allocate();
@@ -133,6 +171,8 @@
 
 void CLCannyEdge::run()
 {
+    _memory_group.acquire();
+
     // Run sobel
     _sobel->run();
 
@@ -152,4 +192,6 @@
     _l1_list_counter.clear(CLScheduler::get().queue());
     _l1_stack.clear(CLScheduler::get().queue());
     CLScheduler::get().enqueue(_edge_trace, true);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp
index 79a3676..11605cf 100644
--- a/src/runtime/CL/functions/CLChannelCombine.cpp
+++ b/src/runtime/CL/functions/CLChannelCombine.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
 
 #include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,14 +32,14 @@
 
 void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLChannelCombineKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>();
     k->configure(plane0, plane1, plane2, plane3, output);
     _kernel = std::move(k);
 }
 
 void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLChannelCombineKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>();
     k->configure(plane0, plane1, plane2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp
index 2c6174b..5090382 100644
--- a/src/runtime/CL/functions/CLChannelExtract.cpp
+++ b/src/runtime/CL/functions/CLChannelExtract.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
 
 #include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,14 +32,14 @@
 
 void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLChannelExtractKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>();
     k->configure(input, channel, output);
     _kernel = std::move(k);
 }
 
 void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLChannelExtractKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>();
     k->configure(input, channel, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp
index 2fe465a..65f8ac3 100644
--- a/src/runtime/CL/functions/CLColorConvert.cpp
+++ b/src/runtime/CL/functions/CLColorConvert.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLColorConvert.h"
 
 #include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,28 +32,28 @@
 
 void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
 
 void CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
 
 void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
 
 void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index 21b5d47..a9b0867 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -26,13 +26,13 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -40,15 +40,15 @@
 
 void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<CLConvolution3x3Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLConvolution3x3Kernel>();
     k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
 }
 
 template <unsigned int matrix_size>
-CLConvolutionSquare<matrix_size>::CLConvolutionSquare()
-    : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+CLConvolutionSquare<matrix_size>::CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
 {
 }
 
@@ -66,6 +66,9 @@
         std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size);
         _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
 
+        // Manage intermediate buffers
+        _memory_group.manage(&_tmp);
+
         if(scale == 0)
         {
             scale = calculate_matrix_scale(conv, matrix_size);
@@ -92,8 +95,12 @@
 
     if(_is_separable)
     {
+        _memory_group.acquire();
+
         CLScheduler::get().enqueue(_kernel_hor, false);
         CLScheduler::get().enqueue(_kernel_vert);
+
+        _memory_group.release();
     }
     else
     {
@@ -107,7 +114,7 @@
 
 void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<CLConvolutionRectangleKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLConvolutionRectangleKernel>();
     k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index f0bbc35..4b1bfd8 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -24,32 +24,31 @@
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
 
 #include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 #include <cmath>
+#include <memory>
 #include <tuple>
 
 using namespace arm_compute;
 
-CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
-    : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
 {
 }
 
 void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
 
     if(biases != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
         ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
         ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
@@ -65,10 +64,12 @@
         const unsigned int mat_weights_cols = weights->info()->dimension(3);
         const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
         TensorShape        shape_wr(mat_weights_cols, mat_weights_rows);
-        const DataType     dt = weights->info()->data_type();
-        TensorInfo         info_wr(shape_wr, 1, dt);
+        const DataType     dt                   = weights->info()->data_type();
+        const int          fixed_point_position = weights->info()->fixed_point_position();
+        TensorInfo         info_wr(shape_wr, 1, dt, fixed_point_position);
 
         _weights_reshaped.allocator()->init(info_wr);
+        _memory_group.manage(&_weights_reshaped);
         _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
         _weights_transposed_kernel.configure(&_weights_reshaped, output);
         _weights_reshaped.allocator()->allocate();
@@ -81,41 +82,50 @@
 
 void CLConvolutionLayerReshapeWeights::run()
 {
+    _memory_group.acquire();
+
     cl::CommandQueue q = CLScheduler::get().queue();
     CLScheduler::get().enqueue(_weights_reshape_kernel);
     if(_transpose1xW)
     {
         CLScheduler::get().enqueue(_weights_transposed_kernel);
     }
+
+    _memory_group.release();
 }
 
-CLConvolutionLayer::CLConvolutionLayer()
-    : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
-      _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
+      _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
 {
 }
 
 void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
     ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
 
     if(biases != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
         ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
         ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
     }
 
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
+
+    // Set the GPU target for matrix multiply
+    _mm_kernel.set_target(CLScheduler::get().target());
+
     _has_bias             = (biases != nullptr);
     _are_weights_reshaped = weights_info.are_reshaped();
 
-    // Get parameters for conv_info
+    // Get parameters from conv_info
     unsigned int stride_x = 0;
     unsigned int stride_y = 0;
     unsigned int pad_x    = 0;
@@ -127,20 +137,21 @@
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
 
-    const unsigned int kernel_width = _are_weights_reshaped ? weights_info.kernel_size() : weights->info()->dimension(0);
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
-                                                 stride_x, stride_y, pad_x, pad_y, conv_info.round());
-    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+    const unsigned int kernel_width  = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
+    const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+                                                 conv_info);
 
     // Check if its a "fully connected" convolution
     _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
 
-    // Create tensor to store the reshaped weights
-    size_t mat_weights_cols = weights->info()->dimension(3);
-    size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+    unsigned int mat_weights_cols = weights->info()->dimension(3);
+    unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+
+    // Reshape weights if needed
     if(_are_weights_reshaped)
     {
-        mat_weights_cols                         = output->info()->dimension(2);
+        mat_weights_cols                         = weights_info.num_kernels();
         const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
         mat_weights_rows                         = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
     }
@@ -150,77 +161,75 @@
         {
             // Create tensor to store the reshaped weights
             TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
-            TensorInfo  info_wr(shape_wr, 1, weights->info()->data_type());
+            TensorInfo  info_wr(shape_wr, 1, dt, fixed_point_position);
             _weights_reshaped.allocator()->init(info_wr);
-            _reshape_weights.configure(weights, biases, &_weights_reshaped, false);
-            weights = &_weights_reshaped;
+            _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
         }
         else
         {
             // Create tensor to store transposed weights
-            TensorShape shape_wt(mat_weights_rows * 4, static_cast<size_t>(std::ceil(mat_weights_cols / 4.f)));
-            TensorInfo  info_wt(shape_wt, 1, weights->info()->data_type());
-            _weights_transposed.allocator()->init(info_wt);
-            _reshape_weights.configure(weights, biases, &_weights_transposed, true);
-            weights = &_weights_transposed;
+            const float transpose_width = 16.0f / input->info()->element_size();
+            TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
+            TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
+            _weights_reshaped.allocator()->init(info_wt);
+            _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
         }
+        weights = &_weights_reshaped;
     }
+
     // Create tensor to store im2col reshaped inputs
-    const size_t mat_input_cols = mat_weights_rows;
-    const size_t mat_input_rows = conv_w * conv_h;
-    TensorShape  shape_im2col   = input->info()->tensor_shape();
+    const unsigned int mat_input_cols = mat_weights_rows;
+    const unsigned int mat_input_rows = conv_w * conv_h;
+    TensorShape        shape_im2col   = input->info()->tensor_shape();
     shape_im2col.set(0, mat_input_cols);
     shape_im2col.set(1, mat_input_rows);
     shape_im2col.set(2, 1);
-    _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+    _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+    _memory_group.manage(&_input_im2col_reshaped);
 
     // Create tensor (interleave) to prepare input tensor for GEMM
     if(!_is_fully_connected_convolution)
     {
         TensorShape shape_interleaved = shape_im2col;
         shape_interleaved.set(0, shape_interleaved.x() * 4);
-        shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4.f));
-        _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
+        shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+        _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+        _memory_group.manage(&_input_interleaved_reshaped);
     }
 
     // Create GEMM output tensor
     TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
     shape_gemm.set(0, mat_weights_cols);
     shape_gemm.set(1, mat_input_rows);
-    _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+    _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
+    _memory_group.manage(&_gemm_output);
 
     // Configure kernels
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
-    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
 
+    // Configure matrix multiply
     if(_is_fully_connected_convolution)
     {
-        _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+        // Matrix A and matrix B have not been reshaped
+        _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f, false);
     }
     else
     {
         _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
         _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
-    }
-
-    if(!_are_weights_reshaped)
-    {
-        if(!_is_fully_connected_convolution)
-        {
-            _weights_transposed.allocator()->allocate();
-        }
-        else
-        {
-            _weights_reshaped.allocator()->allocate();
-        }
-    }
-
-    _input_im2col_reshaped.allocator()->allocate();
-    if(!_is_fully_connected_convolution)
-    {
         _input_interleaved_reshaped.allocator()->allocate();
     }
+    _input_im2col_reshaped.allocator()->allocate();
+    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
     _gemm_output.allocator()->allocate();
+
+    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+
+    // Allocate intermediate tensor
+    if(!_are_weights_reshaped)
+    {
+        _weights_reshaped.allocator()->allocate();
+    }
 }
 
 void CLConvolutionLayer::run()
@@ -232,6 +241,8 @@
         _reshape_weights.run();
     }
 
+    _memory_group.acquire();
+
     // Run input reshaping
     CLScheduler::get().enqueue(_input_im2col_kernel);
     if(!_is_fully_connected_convolution)
@@ -244,4 +255,6 @@
 
     // Reshape output matrix
     CLScheduler::get().enqueue(_output_col2im_kernel, false);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp
index d967d98..f42627f 100644
--- a/src/runtime/CL/functions/CLDepthConcatenate.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenate.cpp
@@ -24,22 +24,23 @@
 #include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-CLDepthConcatenate::CLDepthConcatenate()
-    : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+CLDepthConcatenate::CLDepthConcatenate() // NOLINT
+    : _inputs_vector(),
+      _concat_kernels_vector(),
+      _border_handlers_vector(),
+      _num_inputs(0)
 {
 }
 
-void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output)
+void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
 {
     ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
 
@@ -47,8 +48,8 @@
 
     unsigned int depth_offset = 0;
 
-    _concat_kernels_vector  = arm_compute::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
-    _border_handlers_vector = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
+    _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
 
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
diff --git a/src/runtime/CL/functions/CLDepthConvert.cpp b/src/runtime/CL/functions/CLDepthConvert.cpp
index edcd492..b64d05b 100644
--- a/src/runtime/CL/functions/CLDepthConvert.cpp
+++ b/src/runtime/CL/functions/CLDepthConvert.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
 
 #include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLDepthConvert::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
 {
-    auto k = arm_compute::cpp14::make_unique<CLDepthConvertKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertKernel>();
     k->configure(input, output, policy, shift);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolution.cpp b/src/runtime/CL/functions/CLDepthwiseConvolution.cpp
new file mode 100644
index 0000000..22c037f
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthwiseConvolution.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDepthwiseConvolution3x3::CLDepthwiseConvolution3x3()
+    : _kernel(), _border_handler()
+{
+}
+
+void CLDepthwiseConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+    _kernel.configure(input, output, weights, conv_info);
+    _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+void CLDepthwiseConvolution3x3::run()
+{
+    CLScheduler::get().enqueue(_border_handler);
+    CLScheduler::get().enqueue(_kernel);
+}
+
+CLDepthwiseConvolution::CLDepthwiseConvolution()
+    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), _weights_reshaped(),
+      _v2mm_output()
+{
+}
+
+void CLDepthwiseConvolution::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
+
+    const size_t weights_w = weights->info()->dimension(0);
+    const size_t weights_h = weights->info()->dimension(1);
+    const size_t weights_z = weights->info()->dimension(2);
+
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
+
+    // Set up intermediate tensors
+    const size_t patch_size = weights_w * weights_h;
+    const size_t conv_size  = conv_w * conv_h;
+
+    TensorShape shape_im2col = input->info()->tensor_shape();
+    shape_im2col.set(0, patch_size);
+    shape_im2col.set(1, conv_size);
+    shape_im2col.set(2, weights_z);
+
+    const TensorShape shape_weights_reshape(patch_size, weights_z);
+    TensorShape       shape_v2mm_out = output->info()->tensor_shape();
+    shape_v2mm_out.set(0, conv_size * weights_z);
+    shape_v2mm_out.set(1, 1);
+    shape_v2mm_out.set(2, 1);
+
+    const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    const TensorInfo info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+    const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    _input_reshaped.allocator()->init(info_im2col);
+    _weights_reshaped.allocator()->init(info_weights_reshape);
+    _v2mm_output.allocator()->init(info_v2mm_out);
+
+    // Configure kernels
+    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info);
+    _weights_reshape_kernel.configure(weights, &_weights_reshaped);
+    _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+    _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
+
+    BorderSize border_size = _v2mm_kernel.border_size();
+    _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
+
+    border_size.bottom = 0;
+    _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
+
+    // Allocate intermediate tensors
+    _input_reshaped.allocator()->allocate();
+    _weights_reshaped.allocator()->allocate();
+    _v2mm_output.allocator()->allocate();
+}
+
+void CLDepthwiseConvolution::run()
+{
+    CLScheduler::get().enqueue(_im2col_kernel);
+
+    CLScheduler::get().enqueue(_weights_reshape_kernel);
+
+    CLScheduler::get().enqueue(_v2mm_input_fill_border);
+    CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+    CLScheduler::get().enqueue(_v2mm_kernel);
+
+    CLScheduler::get().enqueue(_vector_to_tensor_kernel);
+}
diff --git a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
new file mode 100644
index 0000000..c325b3e
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDepthwiseSeparableConvolutionLayer::CLDepthwiseSeparableConvolutionLayer() // Default constructor; takes no arguments.
+    : _depthwise_conv(), _pointwise_conv() // both sub-functions are value-initialised here; tensors are only bound later, in configure()
+{
+}
+
+void CLDepthwiseSeparableConvolutionLayer::configure(ICLTensor *input, const ICLTensor *depthwise_weights, ICLTensor *depthwise_out, const ICLTensor *pointwise_weights, const ICLTensor *biases, // Configures the depthwise stage, then the pointwise stage that reads its result.
+                                                     ICLTensor           *output, // final destination, handed to the pointwise stage only
+                                                     const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info) // one PadStrideInfo per stage
+{
+    _depthwise_conv.configure(input, depthwise_out, depthwise_weights, depthwise_conv_info); // note the argument order: the destination (depthwise_out) is passed before the weights
+    _pointwise_conv.configure(depthwise_out, pointwise_weights, biases, output, pointwise_conv_info); // depthwise_out is reused as this stage's input; biases are passed to this stage only
+}
+
+void CLDepthwiseSeparableConvolutionLayer::run() // Runs the two sub-functions back to back.
+{
+    _depthwise_conv.run(); // runs first: configure() passes its destination tensor to the pointwise stage as input
+    _pointwise_conv.run(); // runs second, on the tensor written by the depthwise stage
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
new file mode 100644
index 0000000..5559d42
--- /dev/null
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDequantizationLayer::CLDequantizationLayer() // Default constructor; takes no arguments.
+    : _dequantize_kernel() // the single kernel member is value-initialised; it is bound to tensors in configure()
+{
+}
+
+void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max) // Binds the dequantization kernel to the given tensors.
+{
+    _dequantize_kernel.configure(input, output, min_max); // all three pointers are forwarded unchanged; no checks are performed at this level
+}
+
+void CLDequantizationLayer::run() // Enqueues the configured dequantization kernel on the CL scheduler.
+{
+    // Run dequantization kernel
+    CLScheduler::get().enqueue(_dequantize_kernel, false); // NOTE(review): the `false` is presumably the scheduler's flush flag (no queue flush after this enqueue) - confirm against CLScheduler::enqueue
+}
diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp
index c51cb4c..ae49996 100644
--- a/src/runtime/CL/functions/CLDerivative.cpp
+++ b/src/runtime/CL/functions/CLDerivative.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLDerivative.h"
 
 #include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,8 +33,8 @@
 
 void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<CLDerivativeKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLDerivativeKernel>();
     k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp
index 345f477..59c5ea5 100644
--- a/src/runtime/CL/functions/CLDilate.cpp
+++ b/src/runtime/CL/functions/CLDilate.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLDilate.h"
 
 #include "arm_compute/core/CL/kernels/CLDilateKernel.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,8 +33,8 @@
 
 void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<CLDilateKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLDilateKernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
new file mode 100644
index 0000000..6fafd9c
--- /dev/null
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDirectConvolutionLayer::CLDirectConvolutionLayer() // Default constructor; takes no arguments.
+    : _direct_conv_kernel(), _input_border_handler() // both members are value-initialised; they are set up in configure()
+{
+}
+
+void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) // Sets up the direct-convolution kernel and the border fill applied to its input.
+{
+    // Set GPU target
+    _direct_conv_kernel.set_target(CLScheduler::get().target()); // the target is taken from the scheduler singleton and set before the kernel is configured
+
+    // Configure direct convolution
+    _direct_conv_kernel.configure(input, weights, biases, output, conv_info); // all arguments are forwarded unchanged
+
+    // Configure border handler
+    _input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0)); // border size is queried after the kernel is configured; the input border is filled with the constant 0
+}
+
+void CLDirectConvolutionLayer::run() // Enqueues the border fill, then the convolution kernel.
+{
+    // Run border handler
+    CLScheduler::get().enqueue(_input_border_handler, false); // NOTE(review): `false` is presumably the flush flag, deferring the flush to the next enqueue - confirm against CLScheduler::enqueue
+
+    // Run direct convolution
+    CLScheduler::get().enqueue(_direct_conv_kernel); // second argument omitted, so the scheduler's default applies
+}
diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp
index b4c50e4..eb1f6e4 100644
--- a/src/runtime/CL/functions/CLErode.cpp
+++ b/src/runtime/CL/functions/CLErode.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLErode.h"
 
 #include "arm_compute/core/CL/kernels/CLErodeKernel.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,8 +33,8 @@
 
 void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<CLErodeKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLErodeKernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
index d2903fb..7a0dd09 100644
--- a/src/runtime/CL/functions/CLFastCorners.cpp
+++ b/src/runtime/CL/functions/CLFastCorners.cpp
@@ -36,8 +36,9 @@
 
 using namespace arm_compute;
 
-CLFastCorners::CLFastCorners()
-    : _fast_corners_kernel(),
+CLFastCorners::CLFastCorners(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)),
+      _fast_corners_kernel(),
       _suppr_func(),
       _copy_array_kernel(),
       _output(),
@@ -70,6 +71,7 @@
 
     const bool update_number = (nullptr != _num_corners);
 
+    _memory_group.manage(&_output);
     _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, border_mode);
 
     if(!_non_max)
@@ -79,6 +81,7 @@
     else
     {
         _suppr.allocator()->init(tensor_info);
+        _memory_group.manage(&_suppr);
 
         _suppr_func.configure(&_output, &_suppr, border_mode);
         _copy_array_kernel.configure(&_suppr, update_number, corners, &_num_buffer);
@@ -94,6 +97,8 @@
 {
     cl::CommandQueue q = CLScheduler::get().queue();
 
+    _memory_group.acquire();
+
     if(_non_max)
     {
         ARM_COMPUTE_ERROR_ON_MSG(_output.cl_buffer().get() == nullptr, "Unconfigured function");
@@ -124,4 +129,6 @@
     }
 
     q.flush();
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp
index 9e59b77..54c096e 100644
--- a/src/runtime/CL/functions/CLFillBorder.cpp
+++ b/src/runtime/CL/functions/CLFillBorder.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLFillBorder.h"
 
 #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<CLFillBorderKernel>();
-    k->configure(tensor, border_width, border_mode, constant_border_value);
+    auto k = arm_compute::support::cpp14::make_unique<CLFillBorderKernel>();
+    k->configure(tensor, BorderSize(border_width), border_mode, constant_border_value);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
new file mode 100644
index 0000000..9f571b2
--- /dev/null
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "arm_compute/core/Size2D.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output) // Implements flatten by configuring a CLIm2ColKernel and storing it as this function's kernel.
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLIm2ColKernel>(); // uniquely-owned kernel; ownership moves into _kernel below
+    k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false); // 1x1 kernel size; NOTE(review): PadStrideInfo(1, 1, 0, 0) is presumably unit stride / zero padding and `false` presumably "no bias" - confirm against CLIm2ColKernel::configure
+    _kernel = std::move(k); // hand the configured kernel over to the _kernel member
+}
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
new file mode 100644
index 0000000..364db34
--- /dev/null
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFloor.h"
+
+#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLFloor::configure(const ICLTensor *input, ICLTensor *output) // Creates a CLFloorKernel for the given tensors and stores it as this function's kernel.
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLFloorKernel>(); // uniquely-owned kernel; ownership moves into _kernel below
+    k->configure(input, output); // both tensors are forwarded unchanged
+    _kernel = std::move(k); // hand the configured kernel over to the _kernel member
+}
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 57d57d5..ee1558f 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -23,88 +23,31 @@
  */
 #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
 
+#include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
 
 #include <algorithm>
-#include <cmath>
 
 using namespace arm_compute;
 
-CLFullyConnectedLayerReshapeWeights::CLFullyConnectedLayerReshapeWeights()
-    : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) // Weight reshaping is now a single CLTransposeKernel from input to output.
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>(); // uniquely-owned kernel; ownership moves into _kernel below
+    k->configure(input, output); // both tensors are forwarded unchanged; no checks are performed at this level
+    _kernel = std::move(k); // hand the configured kernel over to the _kernel member
+}
+
+CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(),
+      _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
 {
 }
 
-void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer)
+void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(output == nullptr);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
-    ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
-
-    const DataType dt                   = input->info()->data_type();
-    const int      fixed_point_position = input->info()->fixed_point_position();
-
-    _transpose_weights   = transpose_weights;
-    _is_batched_fc_layer = is_batched_fc_layer;
-
-    // Check if we need to transpose the weights
-    if(_transpose_weights)
-    {
-        if(_is_batched_fc_layer)
-        {
-            // Initialize the output tensor for transpose
-            TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
-            _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
-            _transpose_kernel.configure(input, &_transpose_output);
-
-            // Configure transpose 1xW kernel
-            _transpose1xW_kernel.configure(&_transpose_output, output);
-
-            // Allocate temporary tensor used for transposing the weights
-            _transpose_output.allocator()->allocate();
-        }
-        else
-        {
-            _transpose_kernel.configure(input, output);
-        }
-    }
-    else
-    {
-        if(_is_batched_fc_layer)
-        {
-            // Configure transpose 1xW kernel
-            _transpose1xW_kernel.configure(input, output);
-        }
-        else
-        {
-            ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
-        }
-    }
-}
-
-void CLFullyConnectedLayerReshapeWeights::run()
-{
-    if(_transpose_weights)
-    {
-        CLScheduler::get().enqueue(_transpose_kernel, _is_batched_fc_layer);
-    }
-    if(_is_batched_fc_layer)
-    {
-        CLScheduler::get().enqueue(_transpose1xW_kernel);
-    }
-}
-
-CLFullyConnectedLayer::CLFullyConnectedLayer()
-    : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
-      _are_weights_reshaped(true), _is_fc_after_conv(true), _is_batched_fc_layer(false), _accumulate_biases(false)
-{
-}
-
-void CLFullyConnectedLayer::configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
+    ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
 
     const DataType dt                   = input->info()->data_type();
     const int      fixed_point_position = input->info()->fixed_point_position();
@@ -119,93 +62,33 @@
     shape_im2col.set(3, input->info()->dimension(5));
     _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
 
-    // Initialize output tensor for interleave 4x4
-    TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
-    shape_interleaved.set(0, shape_interleaved.x() * 4);
-    shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
-    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
-
     // Configure im2col kernel
-    _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
-    // Configure interleave4x4 kernel
-    _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
+    _memory_group.manage(&_im2col_output);
+    _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
 
     // Configure matrix multiply kernel
-    _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
-
-    // Allocate the tensors once all the configure methods have been called
-    _im2col_output.allocator()->allocate();
-    _interleave4x4_output.allocator()->allocate();
-}
-
-void CLFullyConnectedLayer::configure_fc_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
-{
-    const DataType dt                   = input->info()->data_type();
-    const int      fixed_point_position = input->info()->fixed_point_position();
-
-    // Initialize output tensor for interleave 4x4
-    TensorShape shape_interleaved = input->info()->tensor_shape();
-    shape_interleaved.set(0, shape_interleaved.x() * 4);
-    shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
-    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
-
-    // Configure interleave4x4 kernel
-    _interleave4x4_kernel.configure(input, &_interleave4x4_output);
-
-    // Configure matrix multiply kernel
-    _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
-
-    // Allocate the tensors once all the configure methods have been called
-    _interleave4x4_output.allocator()->allocate();
-}
-
-void CLFullyConnectedLayer::configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
-    const DataType dt                   = input->info()->data_type();
-    const int      fixed_point_position = input->info()->fixed_point_position();
-
-    // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
-    // Initialize output tensor for im2col
-    TensorShape shape_im2col;
-    shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
-    shape_im2col.set(1, 1);
-    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
-
-    // Configure im2col kernel
-    _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
-    // Configure matrix multiply kernel
-    _mm_kernel.configure(&_im2col_output, weights, output, 1.0f);
+    _mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false);
 
     // Allocate the output tensor for im2col once all the configure methods have been called
     _im2col_output.allocator()->allocate();
 }
 
-void CLFullyConnectedLayer::configure_fc_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
 {
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
 
     // Configure matrix multiply kernel
-    _mm_kernel.configure(input, weights, output, 1.0f);
+    _mm_kernel.configure(input, weights, output, 1.0f, false);
 }
 
 void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
 
-    const DataType dt                   = input->info()->data_type();
-    const int      fixed_point_position = input->info()->fixed_point_position();
-
-    _are_weights_reshaped = are_weights_reshaped;
+    _are_weights_reshaped = transpose_weights ? are_weights_reshaped : true;
     _is_fc_after_conv     = true;
-    _is_batched_fc_layer  = false;
     _accumulate_biases    = false;
 
     if(biases != nullptr)
@@ -224,90 +107,46 @@
     //  3) Convolution layer -> Fully Connected layer with batches
     //  4) Fully Connected layer -> Fully Connected layer with batches
 
-    // Check if we have a fully connected layer with batches
-    _is_batched_fc_layer = (output->info()->dimension(1) > 1);
-
     const ICLTensor *weights_to_use = weights;
 
-    if(!are_weights_reshaped)
+    if(!_are_weights_reshaped)
     {
-        if((transpose_weights || _is_batched_fc_layer))
-        {
-            weights_to_use = &_reshape_weights_output;
+        weights_to_use = &_reshape_weights_output;
 
-            if(transpose_weights)
-            {
-                if(_is_batched_fc_layer)
-                {
-                    const float transpose_width = 16.0f / input->info()->element_size();
-                    TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
-                    TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
-                    _reshape_weights_output.allocator()->init(info_wt);
-                }
-                else
-                {
-                    TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
-                    TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
-                    _reshape_weights_output.allocator()->init(info_wt);
-                }
-            }
-            else
-            {
-                ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
-
-                const float transpose_width = 16.0f / input->info()->element_size();
-                TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
-                TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
-                _reshape_weights_output.allocator()->init(info_wt);
-            }
-
-            // Reshape the weights
-            _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
-        }
+        // Reshape the weights
+        _reshape_weights_kernel.configure(weights, &_reshape_weights_output);
     }
 
-    if(_is_batched_fc_layer)
+    // Check if we have a fully connected layer with batches
+    const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+
+    if(is_batched_fc_layer)
     {
         _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
                                                                                   input->info()->tensor_shape().cend(),
                                                                                   output->info()->tensor_shape().cbegin() + 1));
-
-        if(_is_fc_after_conv)
-        {
-            // Fully Connected layer after a Convolution Layer with batches
-            configure_conv_fc_wb(input, weights_to_use, output);
-        }
-        else
-        {
-            // Fully Connected layer after a Fully Connected Layer with batches
-            configure_fc_fc_wb(input, weights_to_use, output);
-        }
     }
     else
     {
-        // In case of not batched fully connected layer, the weights will not be reshaped using transposed1xW
-        _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+        _is_fc_after_conv = input->info()->num_dimensions() > 1;
+    }
 
-        if(_is_fc_after_conv)
-        {
-            // Fully Connected layer after a Convolution Layer without batches
-            configure_conv_fc_nb(input, weights_to_use, output);
-        }
-        else
-        {
-            // Fully Connected layer after a Fully Connected Layer without batches
-            configure_fc_fc_nb(input, weights_to_use, output);
-        }
+    if(_is_fc_after_conv)
+    {
+        // Fully Connected layer after a Convolution Layer without batches
+        configure_conv_fc(input, weights_to_use, output);
+    }
+    else
+    {
+        // Fully Connected layer after a Fully Connected Layer without batches
+        configure_fc_fc(input, weights_to_use, output);
     }
 
     // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
-    if(!are_weights_reshaped)
+    if(!_are_weights_reshaped)
     {
-        if(transpose_weights || _is_batched_fc_layer)
-        {
-            // Allocate the tensor for the weights reshaped
-            _reshape_weights_output.allocator()->allocate();
-        }
+        // Allocate the tensor for the weights reshaped
+        _reshape_weights_output.allocator()->allocate();
     }
 }
 
@@ -320,18 +159,14 @@
         _reshape_weights_kernel.run();
     }
 
+    _memory_group.acquire();
+
     // Linearize input if it comes from a convolutional layer
     if(_is_fc_after_conv)
     {
         CLScheduler::get().enqueue(_im2col_kernel, false);
     }
 
-    // Interleave input
-    if(_is_batched_fc_layer)
-    {
-        CLScheduler::get().enqueue(_interleave4x4_kernel, false);
-    }
-
     // Run matrix multiply
     CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
 
@@ -340,4 +175,6 @@
     {
         CLScheduler::get().enqueue(_accumulate_biases_kernel);
     }
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 7408054..a81d113 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -38,20 +38,18 @@
 
 using namespace arm_compute;
 
-CLGEMM::CLGEMM()
-    : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
+CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
 {
 }
 
 void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
 
     if(c != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
         ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
         ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
@@ -59,13 +57,18 @@
         ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
     }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
     ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
 
-    // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
-    if(a->info()->dimension(1) != 1)
+    // If the input tensor has 16 rows or fewer, we run a special version of GEMM without reshaping the input tensors
+    _is_interleaved_transposed = a->info()->dimension(1) > 16;
+
+    const ICLTensor *matrix_a = a;
+    const ICLTensor *matrix_b = b;
+
+    if(_is_interleaved_transposed)
     {
-        _run_vector_matrix_multiplication = false;
+        matrix_a = &_tmp_a;
+        matrix_b = &_tmp_b;
 
         TensorShape shape_tmp_a = a->info()->tensor_shape();
         TensorShape shape_tmp_b = b->info()->tensor_shape();
@@ -73,27 +76,20 @@
         shape_tmp_a.set(0, a->info()->dimension(0) * 4);
         shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
 
-        if(DataType::F32 == a->info()->data_type())
-        {
-            shape_tmp_b.set(0, b->info()->dimension(1) * 4);
-            shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.0f));
-        }
-        else if(DataType::F16 == a->info()->data_type())
-        {
-            shape_tmp_b.set(0, b->info()->dimension(1) * 8);
-            shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 8.0f));
-        }
-        else
-        {
-            ARM_COMPUTE_ERROR("DataType not supported");
-        }
+        const unsigned int transpose_w = max_cl_vector_width / data_size_from_type(b->info()->data_type());
+        shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
+        shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
 
-        TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+        TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
         _tmp_a.allocator()->init(info_a);
 
-        TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+        TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position());
         _tmp_b.allocator()->init(info_b);
 
+        // Manage intermediate buffers
+        _memory_group.manage(&_tmp_a);
+        _memory_group.manage(&_tmp_b);
+
         // Configure interleave kernel
         _interleave_kernel.configure(a, &_tmp_a);
 
@@ -101,19 +97,17 @@
         _transpose_kernel.configure(b, &_tmp_b);
 
         // Configure matrix multiply kernel
-        _mm_kernel.configure(&_tmp_a, &_tmp_b, output, alpha);
+        _mm_kernel.set_target(CLScheduler::get().target());
+    }
 
+    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
+
+    if(_is_interleaved_transposed)
+    {
         // Allocate intermediate tensors
         _tmp_a.allocator()->allocate();
         _tmp_b.allocator()->allocate();
     }
-    else // The first input tensor is a vector
-    {
-        _run_vector_matrix_multiplication = true;
-
-        // Configure the matrix multiply kernel
-        _mm_kernel.configure(a, b, output, alpha);
-    }
 
     // Configure matrix addition kernel
     if(beta != 0 && c != nullptr)
@@ -125,7 +119,9 @@
 
 void CLGEMM::run()
 {
-    if(!_run_vector_matrix_multiplication)
+    _memory_group.acquire();
+
+    if(_is_interleaved_transposed)
     {
         // Run interleave kernel
         CLScheduler::get().enqueue(_interleave_kernel, false);
@@ -142,4 +138,6 @@
     {
         CLScheduler::get().enqueue(_ma_kernel);
     }
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
index 9dc7715..45547e4 100644
--- a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
+++ b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
@@ -24,13 +24,13 @@
 #include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h"
 
 #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
 void CLGEMMInterleave4x4::configure(const ICLTensor *input, ICLTensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLGEMMInterleave4x4Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLGEMMInterleave4x4Kernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp
index 45e011d..db6d11c 100644
--- a/src/runtime/CL/functions/CLGEMMLowp.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowp.cpp
@@ -33,8 +33,8 @@
 
 using namespace arm_compute;
 
-CLGEMMLowp::CLGEMMLowp()
-    : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+CLGEMMLowp::CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
 {
 }
 
@@ -62,6 +62,10 @@
     TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
     _tmp_b.allocator()->init(info_b);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp_a);
+    _memory_group.manage(&_tmp_b);
+
     // Configure kernels
     _interleave_kernel.configure(a, &_tmp_a);
     _transpose_kernel.configure(b, &_tmp_b);
@@ -74,6 +78,8 @@
 
 void CLGEMMLowp::run()
 {
+    _memory_group.acquire();
+
     /* Run interleave kernel */
     CLScheduler::get().enqueue(_interleave_kernel, false);
 
@@ -82,4 +88,6 @@
 
     /* Run matrix multiply kernel */
     CLScheduler::get().enqueue(_mm_kernel, false);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp b/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp
new file mode 100644
index 0000000..d054e01
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLGEMMTranspose1xW::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLGEMMTranspose1xWKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp
index 362a3fe..7ebabd7 100644
--- a/src/runtime/CL/functions/CLGaussian3x3.cpp
+++ b/src/runtime/CL/functions/CLGaussian3x3.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
 
 #include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 
 void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<CLGaussian3x3Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLGaussian3x3Kernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
index e83a8fb..f30eee1 100644
--- a/src/runtime/CL/functions/CLGaussian5x5.cpp
+++ b/src/runtime/CL/functions/CLGaussian5x5.cpp
@@ -35,8 +35,8 @@
 
 using namespace arm_compute;
 
-CLGaussian5x5::CLGaussian5x5()
-    : _kernel_hor(), _kernel_vert(), _border_handler(), _tmp()
+CLGaussian5x5::CLGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _border_handler(), _tmp()
 {
 }
 
@@ -46,6 +46,10 @@
 
     _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, DataType::U16));
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp);
+
+    // Configure kernels
     _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
     _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
     _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
@@ -57,6 +61,11 @@
 void CLGaussian5x5::run()
 {
     CLScheduler::get().enqueue(_border_handler, false);
+
+    _memory_group.acquire();
+
     CLScheduler::get().enqueue(_kernel_hor, false);
     CLScheduler::get().enqueue(_kernel_vert);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index 8a4279e..8436dce 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -27,11 +27,11 @@
 #include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
 #include "arm_compute/core/CL/kernels/CLScaleKernel.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
 
 #include "arm_compute/runtime/CL/CLPyramid.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
@@ -48,8 +48,10 @@
 {
 }
 
-CLGaussianPyramidHalf::CLGaussianPyramidHalf()
-    : _border_handler(), _horizontal_reduction(), _vertical_reduction()
+CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT
+    : _border_handler(),
+      _horizontal_reduction(),
+      _vertical_reduction()
 {
 }
 
@@ -70,9 +72,9 @@
 
     if(num_levels > 1)
     {
-        _border_handler       = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
-        _horizontal_reduction = arm_compute::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
-        _vertical_reduction   = arm_compute::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
+        _border_handler       = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
+        _horizontal_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
+        _vertical_reduction   = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
 
         // Apply half scale to the X dimension of the tensor shape
         TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -119,8 +121,9 @@
     }
 }
 
-CLGaussianPyramidOrb::CLGaussianPyramidOrb()
-    : _gauss5x5(), _scale_nearest()
+CLGaussianPyramidOrb::CLGaussianPyramidOrb() // NOLINT
+    : _gauss5x5(),
+      _scale_nearest()
 {
 }
 
@@ -141,8 +144,8 @@
 
     if(num_levels > 1)
     {
-        _gauss5x5      = arm_compute::cpp14::make_unique<CLGaussian5x5[]>(num_levels - 1);
-        _scale_nearest = arm_compute::cpp14::make_unique<CLScaleKernel[]>(num_levels - 1);
+        _gauss5x5      = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(num_levels - 1);
+        _scale_nearest = arm_compute::support::cpp14::make_unique<CLScaleKernel[]>(num_levels - 1);
 
         PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
 
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index b1b5a03..1470d5c 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -31,8 +31,8 @@
 
 using namespace arm_compute;
 
-CLHOGDescriptor::CLHOGDescriptor()
-    : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
 {
 }
 
@@ -71,9 +71,16 @@
     TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
     _hog_space.allocator()->init(info_space);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_mag);
+    _memory_group.manage(&_phase);
+
     // Initialise gradient kernel
     _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_hog_space);
+
     // Initialise orientation binning kernel
     _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
 
@@ -88,6 +95,8 @@
 
 void CLHOGDescriptor::run()
 {
+    _memory_group.acquire();
+
     // Run gradient
     _gradient.run();
 
@@ -96,4 +105,6 @@
 
     // Run block normalization
     CLScheduler::get().enqueue(_block_norm);
+
+    _memory_group.release();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index 2387474..51aeaed 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -29,8 +29,8 @@
 
 using namespace arm_compute;
 
-CLHOGGradient::CLHOGGradient()
-    : _derivative(), _mag_phase(), _gx(), _gy()
+CLHOGGradient::CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _derivative(), _mag_phase(), _gx(), _gy()
 {
 }
 
@@ -47,6 +47,10 @@
     _gx.allocator()->init(info);
     _gy.allocator()->init(info);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_gx);
+    _memory_group.manage(&_gy);
+
     // Initialise derivate kernel
     _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
 
@@ -67,9 +71,13 @@
 
 void CLHOGGradient::run()
 {
+    _memory_group.acquire();
+
     // Run derivative
     _derivative.run();
 
     // Run magnitude/phase kernel
     CLScheduler::get().enqueue(_mag_phase);
+
+    _memory_group.release();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index b8f2224..8012c2f 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -25,17 +25,31 @@
 
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/CL/CLArray.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/Scheduler.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-CLHOGMultiDetection::CLHOGMultiDetection()
-    : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(),
-      _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0)
+CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _gradient_kernel(),
+      _orient_bin_kernel(),
+      _block_norm_kernel(),
+      _hog_detect_kernel(),
+      _non_maxima_kernel(),
+      _hog_space(),
+      _hog_norm_space(),
+      _detection_windows(),
+      _mag(),
+      _phase(),
+      _non_maxima_suppression(false),
+      _num_orient_bin_kernel(0),
+      _num_block_norm_kernel(0),
+      _num_hog_detect_kernel(0)
 {
 }
 
@@ -114,12 +128,12 @@
     _num_block_norm_kernel  = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
     _num_hog_detect_kernel  = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
 
-    _orient_bin_kernel = arm_compute::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
-    _block_norm_kernel = arm_compute::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
-    _hog_detect_kernel = arm_compute::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
-    _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
-    _hog_space         = arm_compute::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
-    _hog_norm_space    = arm_compute::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
+    _orient_bin_kernel = arm_compute::support::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
+    _block_norm_kernel = arm_compute::support::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
+    _hog_detect_kernel = arm_compute::support::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
+    _non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
+    _hog_space         = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
+    _hog_norm_space    = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
 
     // Allocate tensors for magnitude and phase
     TensorInfo info_mag(shape_img, Format::S16);
@@ -128,6 +142,10 @@
     TensorInfo info_phase(shape_img, Format::U8);
     _phase.allocator()->init(info_phase);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_mag);
+    _memory_group.manage(&_phase);
+
     // Initialise gradient kernel
     _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
 
@@ -153,10 +171,17 @@
         TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
         _hog_space[i].allocator()->init(info_space);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_hog_space.get() + i);
+
         // Initialise orientation binning kernel
         _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
     }
 
+    // Allocate intermediate tensors
+    _mag.allocator()->allocate();
+    _phase.allocator()->allocate();
+
     // Configure CLTensor for the normalized HOG space and block normalization kernel
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
@@ -167,10 +192,19 @@
         TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
         _hog_norm_space[i].allocator()->init(tensor_info);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_hog_norm_space.get() + i);
+
         // Initialize block normalization kernel
         _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
     }
 
+    // Allocate intermediate tensors
+    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+    {
+        _hog_space[i].allocator()->allocate();
+    }
+
     detection_window_strides->map(CLScheduler::get().queue(), true);
 
     // Configure HOG detector kernel
@@ -187,14 +221,6 @@
     _non_maxima_kernel->configure(_detection_windows, min_distance);
 
     // Allocate intermediate tensors
-    _mag.allocator()->allocate();
-    _phase.allocator()->allocate();
-
-    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
-    {
-        _hog_space[i].allocator()->allocate();
-    }
-
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
         _hog_norm_space[i].allocator()->allocate();
@@ -205,6 +231,8 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
 
+    _memory_group.acquire();
+
     // Reset detection window
     _detection_windows->clear();
 
@@ -234,7 +262,9 @@
     {
         // Map detection windows array before computing non maxima suppression
         _detection_windows->map(CLScheduler::get().queue(), true);
-        _non_maxima_kernel->run(_non_maxima_kernel->window());
+        Scheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
         _detection_windows->unmap(CLScheduler::get().queue());
     }
-}
\ No newline at end of file
+
+    _memory_group.release();
+}
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 2db277f..059528f 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -27,7 +27,6 @@
 #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
@@ -36,14 +35,28 @@
 #include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 #include "arm_compute/runtime/Scheduler.h"
+#include "support/ToolchainSupport.h"
 
 #include <cmath>
 #include <utility>
 
 using namespace arm_compute;
 
-CLHarrisCorners::CLHarrisCorners()
-    : _sobel(), _harris_score(), _non_max_suppr(), _candidates(), _sort_euclidean(), _border_gx(), _border_gy(), _gx(), _gy(), _score(), _nonmax(), _corners_list(), _num_corner_candidates(0),
+CLHarrisCorners::CLHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _sobel(nullptr),
+      _harris_score(),
+      _non_max_suppr(),
+      _candidates(),
+      _sort_euclidean(),
+      _border_gx(),
+      _border_gy(),
+      _gx(),
+      _gy(),
+      _score(),
+      _nonmax(),
+      _corners_list(nullptr),
+      _num_corner_candidates(0),
       _corners(nullptr)
 {
 }
@@ -62,6 +75,7 @@
     const TensorShape shape = input->info()->tensor_shape();
     const DataType    dt    = (gradient_size < 7) ? DataType::S16 : DataType::S32;
     TensorInfo        tensor_info(shape, 1, dt);
+
     _gx.allocator()->init(tensor_info);
     _gy.allocator()->init(tensor_info);
 
@@ -69,28 +83,32 @@
     _score.allocator()->init(info_f32);
     _nonmax.allocator()->init(info_f32);
 
-    _corners_list = arm_compute::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+    _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+
+    // Manage intermediate buffers
+    _memory_group.manage(&_gx);
+    _memory_group.manage(&_gy);
 
     /* Set/init Sobel kernel accordingly with gradient_size */
     switch(gradient_size)
     {
         case 3:
         {
-            auto k = arm_compute::cpp14::make_unique<CLSobel3x3>();
+            auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>();
             k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
             _sobel = std::move(k);
             break;
         }
         case 5:
         {
-            auto k = arm_compute::cpp14::make_unique<CLSobel5x5>();
+            auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>();
             k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
             _sobel = std::move(k);
             break;
         }
         case 7:
         {
-            auto k = arm_compute::cpp14::make_unique<CLSobel7x7>();
+            auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>();
             k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
             _sobel = std::move(k);
             break;
@@ -99,37 +117,49 @@
             ARM_COMPUTE_ERROR("Gradient size not implemented");
     }
 
-    // Configure border filling before harris score
-    _border_gx.configure(&_gx, block_size / 2, border_mode, constant_border_value);
-    _border_gy.configure(&_gy, block_size / 2, border_mode, constant_border_value);
-
     // Normalization factor
     const float norm_factor               = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
     const float pow4_normalization_factor = pow(norm_factor, 4);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_score);
+
     // Set/init Harris Score kernel accordingly with block_size
     _harris_score.configure(&_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
 
-    // Init non-maxima suppression function
-    _non_max_suppr.configure(&_score, &_nonmax, border_mode == BorderMode::UNDEFINED);
-
-    // Init corner candidates kernel
-    _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
-
-    // Init euclidean distance
-    _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist);
+    // Configure border filling using harris score kernel's border size
+    _border_gx.configure(&_gx, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
+    _border_gy.configure(&_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
 
     // Allocate intermediate buffers
     _gx.allocator()->allocate();
     _gy.allocator()->allocate();
+
+    // Manage intermediate buffers
+    _memory_group.manage(&_nonmax);
+
+    // Init non-maxima suppression function
+    _non_max_suppr.configure(&_score, &_nonmax, border_mode);
+
+    // Allocate intermediate buffers
     _score.allocator()->allocate();
+
+    // Init corner candidates kernel
+    _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+
+    // Allocate intermediate buffers
     _nonmax.allocator()->allocate();
+
+    // Init euclidean distance
+    _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist);
 }
 
 void CLHarrisCorners::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
 
+    _memory_group.acquire();
+
     // Init to 0 number of corner candidates
     _num_corner_candidates = 0;
 
@@ -144,7 +174,7 @@
     CLScheduler::get().enqueue(_harris_score, false);
 
     // Run non-maxima suppression
-    CLScheduler::get().enqueue(_non_max_suppr);
+    _non_max_suppr.run();
 
     // Run corner candidate kernel
     _nonmax.map(true);
@@ -152,6 +182,8 @@
     _nonmax.unmap();
 
     _corners->map(CLScheduler::get().queue(), true);
-    _sort_euclidean.run(_sort_euclidean.window());
+    Scheduler::get().schedule(&_sort_euclidean, Window::DimY);
     _corners->unmap(CLScheduler::get().queue());
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLL2Normalize.cpp b/src/runtime/CL/functions/CLL2Normalize.cpp
new file mode 100644
index 0000000..99be8ca
--- /dev/null
+++ b/src/runtime/CL/functions/CLL2Normalize.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLL2Normalize.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLL2Normalize::CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
+{
+}
+
+void CLL2Normalize::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon)
+{
+    // Manage intermediate buffers
+    _memory_group.manage(&_sumsq);
+
+    // Configure kernels
+    _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
+    _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
+
+    // Allocate intermediate tensor
+    _sumsq.allocator()->allocate();
+}
+
+void CLL2Normalize::run()
+{
+    _memory_group.acquire();
+
+    _reduce_func.run();
+    CLScheduler::get().enqueue(_normalize_kernel, true);
+
+    _memory_group.release();
+}
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index d7ce206..a395487 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IPyramid.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
@@ -33,11 +32,18 @@
 #include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
 #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
 #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-CLLaplacianPyramid::CLLaplacianPyramid()
-    : _num_levels(0), _gaussian_pyr_function(), _convf(), _subf(), _depth_function(), _gauss_pyr(), _conv_pyr()
+CLLaplacianPyramid::CLLaplacianPyramid() // NOLINT
+    : _num_levels(0),
+      _gaussian_pyr_function(),
+      _convf(),
+      _subf(),
+      _depth_function(),
+      _gauss_pyr(),
+      _conv_pyr()
 {
 }
 
@@ -64,8 +70,8 @@
     // Create Gaussian Pyramid function
     _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
 
-    _convf = arm_compute::cpp14::make_unique<CLGaussian5x5[]>(_num_levels);
-    _subf  = arm_compute::cpp14::make_unique<CLArithmeticSubtraction[]>(_num_levels);
+    _convf = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(_num_levels);
+    _subf  = arm_compute::support::cpp14::make_unique<CLArithmeticSubtraction[]>(_num_levels);
 
     for(unsigned int i = 0; i < _num_levels; ++i)
     {
diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
index 1dfab74..678848b 100644
--- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
+++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
@@ -24,18 +24,21 @@
 #include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IPyramid.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
 
 #include <cstddef>
 
 using namespace arm_compute;
 
-CLLaplacianReconstruct::CLLaplacianReconstruct()
-    : _tmp_pyr(), _addf(), _scalef(), _depthf()
+CLLaplacianReconstruct::CLLaplacianReconstruct() // NOLINT
+    : _tmp_pyr(),
+      _addf(),
+      _scalef(),
+      _depthf()
 {
 }
 
@@ -60,8 +63,8 @@
     _tmp_pyr.init(pyramid_info);
 
     // Allocate add and scale functions. Level 0 does not need to be scaled.
-    _addf   = arm_compute::cpp14::make_unique<CLArithmeticAddition[]>(num_levels);
-    _scalef = arm_compute::cpp14::make_unique<CLScale[]>(num_levels - 1);
+    _addf   = arm_compute::support::cpp14::make_unique<CLArithmeticAddition[]>(num_levels);
+    _scalef = arm_compute::support::cpp14::make_unique<CLScale[]>(num_levels - 1);
 
     const size_t last_level = num_levels - 1;
 
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 263fb51..a89a45a 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -33,8 +33,9 @@
 
 using namespace arm_compute;
 
-CLLocallyConnectedLayer::CLLocallyConnectedLayer()
-    : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+      _is_first_run(false)
 {
 }
 
@@ -68,8 +69,8 @@
     // Get convolved dimensions
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
-                                                 stride_x, stride_y, pad_x, pad_y, conv_info.round());
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+                                                 conv_info);
 
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
     ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
@@ -99,8 +100,12 @@
     shape_gemm.set(1, mat_input_rows);
     _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_input_im2col_reshaped);
+    _memory_group.manage(&_gemm_output);
+
     // Configure kernels
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(conv_w, conv_h), conv_info, _has_bias);
     _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
     _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
     _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
@@ -120,6 +125,8 @@
         CLScheduler::get().enqueue(_weights_reshape_kernel);
     }
 
+    _memory_group.acquire();
+
     // Run input reshaping
     CLScheduler::get().enqueue(_input_im2col_kernel);
 
@@ -128,4 +135,6 @@
 
     // Reshape output matrix
     CLScheduler::get().enqueue(_output_col2im_kernel, false);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
index 51088cb..68b8c35 100644
--- a/src/runtime/CL/functions/CLMagnitude.cpp
+++ b/src/runtime/CL/functions/CLMagnitude.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLMagnitude.h"
 
 #include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
 {
-    auto k = arm_compute::cpp14::make_unique<CLMagnitudePhaseKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
     k->configure(input1, input2, output, nullptr, mag_type);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
index 56ba146..838f7e7 100644
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -23,19 +23,19 @@
  */
 #include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
 
-#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 using namespace arm_compute;
 
 CLMeanStdDev::CLMeanStdDev()
     : _mean_stddev_kernel(),
+      _fill_border_kernel(),
       _global_sum(),
       _global_sum_squared()
 {
 }
 
-void CLMeanStdDev::configure(const ICLImage *input, float *mean, float *stddev)
+void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev)
 {
     _global_sum = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
 
@@ -45,9 +45,11 @@
     }
 
     _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
+    _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
 }
 
 void CLMeanStdDev::run()
 {
+    CLScheduler::get().enqueue(_fill_border_kernel);
     CLScheduler::get().enqueue(_mean_stddev_kernel);
 }
diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp
index 0c10f9a..55f9eaa 100644
--- a/src/runtime/CL/functions/CLMedian3x3.cpp
+++ b/src/runtime/CL/functions/CLMedian3x3.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
 
 #include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 
 void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<CLMedian3x3Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLMedian3x3Kernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp
index ad783d8..49dcbcb 100644
--- a/src/runtime/CL/functions/CLMinMaxLocation.cpp
+++ b/src/runtime/CL/functions/CLMinMaxLocation.cpp
@@ -25,8 +25,8 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLMinMaxLocation::CLMinMaxLocation()
     : _min_max_kernel(),
       _min_max_loc_kernel(),
@@ -41,7 +41,7 @@
 {
 }
 
-void CLMinMaxLocation::configure(const ICLImage *input, int32_t *min, int32_t *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
+void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == min);
     ARM_COMPUTE_ERROR_ON(nullptr == max);
@@ -67,8 +67,8 @@
     CLScheduler::get().enqueue(_min_max_loc_kernel, false);
 
     // Update min and max
-    q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), _min);
-    q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), _max);
+    q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_min));
+    q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_max));
 
     // Update min and max count
     if(_min_count != nullptr)
@@ -96,3 +96,4 @@
         _max_loc->resize(max_corner_size);
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp
index b593a6c..d37412f 100644
--- a/src/runtime/CL/functions/CLNonLinearFilter.cpp
+++ b/src/runtime/CL/functions/CLNonLinearFilter.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h"
 
 #include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
                                   BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<CLNonLinearFilterKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLNonLinearFilterKernel>();
     k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
index ca7d5ae..c0a0cef 100644
--- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
+++ b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
 
 #include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode)
 {
-    auto k = arm_compute::cpp14::make_unique<CLNonMaximaSuppression3x3Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLNonMaximaSuppression3x3Kernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
 
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index 2d89ebd..f4bd494 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -33,28 +33,26 @@
 using namespace arm_compute;
 
 CLNormalizationLayer::CLNormalizationLayer()
-    : _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
+    : _norm_kernel(), _border_handler()
 {
 }
 
-void CLNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
+void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
 {
     ARM_COMPUTE_ERROR_ON(input == nullptr);
 
-    _squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type()));
+    // Configure normalization kernel
+    _norm_kernel.configure(input, output, norm_info);
 
-    _norm_kernel.configure(input, &_squared_input, output, norm_info);
-    _multiply_kernel.configure(input, input, &_squared_input, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
     // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
-    _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
-
-    // Allocate intermediate buffers
-    _squared_input.allocator()->allocate();
+    _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
 }
 
 void CLNormalizationLayer::run()
 {
-    CLScheduler::get().enqueue(_multiply_kernel, false);
+    // Run border handler
     CLScheduler::get().enqueue(_border_handler, false);
-    CLScheduler::get().enqueue(_norm_kernel, false);
+
+    // Run normalization kernel
+    CLScheduler::get().enqueue(_norm_kernel);
 }
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index a6b0eb3..d00b1b5 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -26,7 +26,6 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/CL/CLPyramid.h"
@@ -34,12 +33,27 @@
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
 #include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-CLOpticalFlow::CLOpticalFlow()
-    : _tracker_init_kernel(), _tracker_stage0_kernel(), _tracker_stage1_kernel(), _tracker_finalize_kernel(), _func_scharr(), _scharr_gx(), _scharr_gy(), _old_points(nullptr),
-      _new_points_estimates(nullptr), _new_points(nullptr), _old_points_internal(), _new_points_internal(), _coefficient_table(), _old_values(), _num_levels(0)
+CLOpticalFlow::CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _tracker_init_kernel(),
+      _tracker_stage0_kernel(),
+      _tracker_stage1_kernel(),
+      _tracker_finalize_kernel(),
+      _func_scharr(),
+      _scharr_gx(),
+      _scharr_gy(),
+      _old_points(nullptr),
+      _new_points_estimates(nullptr),
+      _new_points(nullptr),
+      _old_points_internal(),
+      _new_points_internal(),
+      _coefficient_table(),
+      _old_values(),
+      _num_levels(0)
 {
 }
 
@@ -70,21 +84,21 @@
     const int   old_values_list_length = list_length * window_dimension * window_dimension;
 
     // Create kernels and tensors
-    _tracker_init_kernel   = arm_compute::cpp14::make_unique<CLLKTrackerInitKernel[]>(_num_levels);
-    _tracker_stage0_kernel = arm_compute::cpp14::make_unique<CLLKTrackerStage0Kernel[]>(_num_levels);
-    _tracker_stage1_kernel = arm_compute::cpp14::make_unique<CLLKTrackerStage1Kernel[]>(_num_levels);
-    _func_scharr           = arm_compute::cpp14::make_unique<CLScharr3x3[]>(_num_levels);
-    _scharr_gx             = arm_compute::cpp14::make_unique<CLTensor[]>(_num_levels);
-    _scharr_gy             = arm_compute::cpp14::make_unique<CLTensor[]>(_num_levels);
+    _tracker_init_kernel   = arm_compute::support::cpp14::make_unique<CLLKTrackerInitKernel[]>(_num_levels);
+    _tracker_stage0_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage0Kernel[]>(_num_levels);
+    _tracker_stage1_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage1Kernel[]>(_num_levels);
+    _func_scharr           = arm_compute::support::cpp14::make_unique<CLScharr3x3[]>(_num_levels);
+    _scharr_gx             = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
+    _scharr_gy             = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
 
     // Create internal keypoint arrays
-    _old_points_internal = arm_compute::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
+    _old_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
     _old_points_internal->resize(list_length);
-    _new_points_internal = arm_compute::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
+    _new_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
     _new_points_internal->resize(list_length);
-    _coefficient_table = arm_compute::cpp14::make_unique<CLCoefficientTableArray>(list_length);
+    _coefficient_table = arm_compute::support::cpp14::make_unique<CLCoefficientTableArray>(list_length);
     _coefficient_table->resize(list_length);
-    _old_values = arm_compute::cpp14::make_unique<CLOldValueArray>(old_values_list_length);
+    _old_values = arm_compute::support::cpp14::make_unique<CLOldValueArray>(old_values_list_length);
     _old_values->resize(old_values_list_length);
     _new_points->resize(list_length);
 
@@ -103,6 +117,10 @@
         _scharr_gx[i].allocator()->init(tensor_info);
         _scharr_gy[i].allocator()->init(tensor_info);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_scharr_gx.get() + i);
+        _memory_group.manage(_scharr_gy.get() + i);
+
         // Init Scharr kernel
         _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
 
@@ -131,6 +149,8 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
 
+    _memory_group.acquire();
+
     for(unsigned int level = _num_levels; level > 0; --level)
     {
         // Run Scharr kernel
@@ -147,4 +167,6 @@
     }
 
     CLScheduler::get().enqueue(_tracker_finalize_kernel, true);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp
index a8cb22b..cf3fa7e 100644
--- a/src/runtime/CL/functions/CLPhase.cpp
+++ b/src/runtime/CL/functions/CLPhase.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLPhase.h"
 
 #include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type)
 {
-    auto k = arm_compute::cpp14::make_unique<CLMagnitudePhaseKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
     k->configure(input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 8a86c2e..139d466 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
 
 #include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 void CLPixelWiseMultiplication::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
                                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
-    auto k = arm_compute::cpp14::make_unique<CLPixelWiseMultiplicationKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseMultiplicationKernel>();
     k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 1ef70f4..2cb7d63 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -24,14 +24,14 @@
 #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
 
 #include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
 void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
 {
     // Configure pooling kernel
-    auto k = arm_compute::cpp14::make_unique<CLPoolingLayerKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLPoolingLayerKernel>();
     k->configure(input, output, pool_info);
     _kernel = std::move(k);
 
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
new file mode 100644
index 0000000..ed1f51c
--- /dev/null
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLQuantizationLayer::CLQuantizationLayer() // Value-initializes both kernels and the _min_max tensor
+    : _quantize_kernel(), _min_max_kernel(), _min_max()
+{
+}
+
+void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    // Configure min-max kernel on input. The _min_max tensor will be auto-configured within the kernel.
+    _min_max_kernel.configure(input, &_min_max);
+
+    // Configure quantize kernel with input, output and the same _min_max tensor given to the min-max kernel
+    _quantize_kernel.configure(input, output, &_min_max);
+
+    // Allocate the _min_max tensor once both kernels that reference it are configured
+    _min_max.allocator()->allocate();
+}
+
+void CLQuantizationLayer::run()
+{
+    cl::CommandQueue q = CLScheduler::get().queue(); // Queue obtained from the scheduler; passed to the reset below
+
+    // Reset min and max on every run, before the min-max kernel is enqueued
+    _min_max_kernel.reset(q);
+
+    // Run min-max kernel first; the quantize kernel was configured with the same _min_max tensor
+    CLScheduler::get().enqueue(_min_max_kernel, false);
+
+    // Run quantize kernel (NOTE(review): 'false' is presumably the flush flag - confirm against CLScheduler::enqueue)
+    CLScheduler::get().enqueue(_quantize_kernel, false);
+}
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
new file mode 100644
index 0000000..0f480ee
--- /dev/null
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
+
+#include "arm_compute/core/CL/ICLArray.h"
+
+#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+    // Create the ROI pooling kernel, configure it with all arguments forwarded unchanged, then store it in _kernel
+    auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>();
+    k->configure(input, rois, output, pool_info);
+    _kernel = std::move(k); // Ownership of the configured kernel moves to the _kernel member
+}
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
new file mode 100644
index 0000000..d02afb4
--- /dev/null
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager) // memory_manager is moved into the memory group
+    : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
+{
+}
+
+void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+{
+    // Calculate number of WGs along dimension 0. 16 elements per thread, 8 threads per WG, i.e. 128 elements per WG
+    unsigned int num_of_wg = ceil(input->info()->dimension(0) / 128.f);
+
+    // Calculate number of stages (always at least 2). First stage performs op and the rest reduction sum
+    // depending on the size of the input. Last stage should have only 1 WG.
+    _num_of_stages = num_of_wg / 128 + 2;
+
+    // Create temporary tensors: one per stage except the last, which writes directly to output
+    _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+
+    // Create one reduction kernel and one border handler per stage (they are configured further below)
+    _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
+    _border_handlers_vector   = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
+
+    TensorShape shape{ input->info()->tensor_shape() }; // Starts as the input shape; dimension 0 shrinks stage by stage below
+    for(unsigned int i = 0; i < _num_of_stages - 1; i++)
+    {
+        shape.set(0, ceil(shape.x() / 128.f)); // Each stage divides dimension 0 by 128, rounded up
+        _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+    }
+
+    // First stage: apply the requested op from input into _sums_vector[0]; all later stages use SUM
+    _memory_group.manage(_sums_vector.get());
+    _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, op);
+    _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+    // Intermediate stages: reduce _sums_vector[i - 1] into _sums_vector[i] with SUM
+    for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
+    {
+        _memory_group.manage(_sums_vector.get() + i);
+        _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
+        _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+        _sums_vector[i - 1].allocator()->allocate(); // Previous buffer is allocated once the kernels writing and reading it are configured
+    }
+
+    // Last stage: reduce the final intermediate tensor into output with SUM
+    const unsigned int last_stage = _num_of_stages - 1;
+    _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, ReductionOperation::SUM);
+    _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _sums_vector[last_stage - 1].allocator()->allocate();
+}
+
+void CLReductionOperation::run() // Enqueues every configured stage in order, bracketed by memory group acquire/release
+{
+    _memory_group.acquire();
+
+    for(unsigned int i = 0; i < _num_of_stages; ++i)
+    {
+        CLScheduler::get().enqueue(_border_handlers_vector[i], false); // Border handler of stage i goes before that stage's reduction kernel
+        CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+    }
+
+    _memory_group.release();
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp
index f6b1713..bc3fd4e 100644
--- a/src/runtime/CL/functions/CLRemap.cpp
+++ b/src/runtime/CL/functions/CLRemap.cpp
@@ -26,10 +26,10 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLRemapKernel.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -43,7 +43,7 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
 
-    auto k = arm_compute::cpp14::make_unique<CLRemapKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLRemapKernel>();
     k->configure(input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp
new file mode 100644
index 0000000..2ce83dc
--- /dev/null
+++ b/src/runtime/CL/functions/CLReshapeLayer.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLReshapeLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto reshape_kernel = arm_compute::support::cpp14::make_unique<CLReshapeLayerKernel>();
+    reshape_kernel->configure(input, output);
+    _kernel = std::move(reshape_kernel); // Hand the configured kernel over to the member
+}
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 043f873..49b0275 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -26,19 +26,14 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLScaleKernel.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
 {
-    ARM_COMPUTE_ERROR_ON(output == input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-    auto k = arm_compute::cpp14::make_unique<CLScaleKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>();
     k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp
index c8bc465..73f8673 100644
--- a/src/runtime/CL/functions/CLScharr3x3.cpp
+++ b/src/runtime/CL/functions/CLScharr3x3.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
 
 #include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 
 void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<CLScharr3x3Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLScharr3x3Kernel>();
     k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp
index 6b74eba..e227e58 100644
--- a/src/runtime/CL/functions/CLSobel3x3.cpp
+++ b/src/runtime/CL/functions/CLSobel3x3.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
 
 #include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 
 void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<CLSobel3x3Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3Kernel>();
     k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp
index 098b546..d4bc855 100644
--- a/src/runtime/CL/functions/CLSobel5x5.cpp
+++ b/src/runtime/CL/functions/CLSobel5x5.cpp
@@ -33,8 +33,8 @@
 
 using namespace arm_compute;
 
-CLSobel5x5::CLSobel5x5()
-    : _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
+CLSobel5x5::CLSobel5x5(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
 {
 }
 
@@ -51,6 +51,8 @@
     {
         _tmp_x.allocator()->init(tensor_info);
         _tmp_y.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_x);
+        _memory_group.manage(&_tmp_y);
         _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
@@ -59,6 +61,7 @@
     else if(run_sobel_x)
     {
         _tmp_x.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_x);
         _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
@@ -66,6 +69,7 @@
     else if(run_sobel_y)
     {
         _tmp_y.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_y);
         _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_y.allocator()->allocate();
@@ -76,6 +80,11 @@
 void CLSobel5x5::run()
 {
     CLScheduler::get().enqueue(_border_handler, false);
+
+    _memory_group.acquire();
+
     CLScheduler::get().enqueue(_sobel_hor, false);
     CLScheduler::get().enqueue(_sobel_vert);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp
index db84fa9..6083090 100644
--- a/src/runtime/CL/functions/CLSobel7x7.cpp
+++ b/src/runtime/CL/functions/CLSobel7x7.cpp
@@ -33,8 +33,8 @@
 
 using namespace arm_compute;
 
-CLSobel7x7::CLSobel7x7()
-    : _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
+CLSobel7x7::CLSobel7x7(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
 {
 }
 
@@ -51,6 +51,8 @@
     {
         _tmp_x.allocator()->init(tensor_info);
         _tmp_y.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_x);
+        _memory_group.manage(&_tmp_y);
         _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
@@ -59,6 +61,7 @@
     else if(run_sobel_x)
     {
         _tmp_x.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_x);
         _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
@@ -66,6 +69,7 @@
     else if(run_sobel_y)
     {
         _tmp_y.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_y);
         _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_y.allocator()->allocate();
@@ -76,6 +80,11 @@
 void CLSobel7x7::run()
 {
     CLScheduler::get().enqueue(_border_handler, false);
+
+    _memory_group.acquire();
+
     CLScheduler::get().enqueue(_sobel_hor, false);
     CLScheduler::get().enqueue(_sobel_vert);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 2a78c58..7505a2c 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -25,29 +25,34 @@
 
 #include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 using namespace arm_compute;
 
-CLSoftmaxLayer::CLSoftmaxLayer()
-    : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
 {
 }
 
 void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
 
     // Create intermediate tensors shapes
-    _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
+    _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
 
     TensorShape shape = input->info()->tensor_shape();
     shape.set(0, 1);
-    TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
+    TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
     _max.allocator()->init(tensor_info_max_sum);
     _sum.allocator()->init(tensor_info_max_sum);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp);
+    _memory_group.manage(&_max);
+    _memory_group.manage(&_sum);
+
     // Configure Kernels
     _max_kernel.configure(input, &_max);
     _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
@@ -61,7 +66,11 @@
 
 void CLSoftmaxLayer::run()
 {
+    _memory_group.acquire();
+
     CLScheduler::get().enqueue(_max_kernel, false);
     CLScheduler::get().enqueue(_shift_exp_sum_kernel, false);
     CLScheduler::get().enqueue(_norm_kernel);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp
index 743ed5e..d187650 100644
--- a/src/runtime/CL/functions/CLTableLookup.cpp
+++ b/src/runtime/CL/functions/CLTableLookup.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLTableLookup.h"
 
 #include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLTableLookupKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLTableLookupKernel>();
     k->configure(input, lut, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp
index e70f932..1b30b77 100644
--- a/src/runtime/CL/functions/CLThreshold.cpp
+++ b/src/runtime/CL/functions/CLThreshold.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLThreshold.h"
 
 #include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
 {
-    auto k = arm_compute::cpp14::make_unique<CLThresholdKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLThresholdKernel>();
     k->configure(input, output, threshold, false_value, true_value, type, upper);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
index d802b4f..cd19e25 100644
--- a/src/runtime/CL/functions/CLTranspose.cpp
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLTranspose.h"
 
 #include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void CLTranspose::configure(const ICLTensor *input, ICLTensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<CLTransposeKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp
index 537e0d9..f785c75 100644
--- a/src/runtime/CL/functions/CLWarpAffine.cpp
+++ b/src/runtime/CL/functions/CLWarpAffine.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
 
 #include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 
 void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<CLWarpAffineKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLWarpAffineKernel>();
     k->configure(input, output, matrix, policy);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp
index a552ab4..b445b3b 100644
--- a/src/runtime/CL/functions/CLWarpPerspective.cpp
+++ b/src/runtime/CL/functions/CLWarpPerspective.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
 
 #include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 
 void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<CLWarpPerspectiveKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLWarpPerspectiveKernel>();
     k->configure(input, output, matrix, policy);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 8869330..a83a0bc 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -28,91 +28,89 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
 
+#include <condition_variable>
 #include <iostream>
-#include <semaphore.h>
+#include <mutex>
 #include <system_error>
 #include <thread>
 
-using namespace arm_compute;
-
-class arm_compute::Thread
+namespace arm_compute
+{
+class Thread
 {
 public:
-    /** Start a new thread
-     */
+    /** Start a new thread. */
     Thread();
+
     Thread(const Thread &) = delete;
     Thread &operator=(const Thread &) = delete;
     Thread(Thread &&)                 = delete;
     Thread &operator=(Thread &&) = delete;
-    /** Make the thread join
-     */
+
+    /** Destructor. Make the thread join. */
     ~Thread();
+
     /** Request the worker thread to start executing the given kernel
      * This function will return as soon as the kernel has been sent to the worker thread.
      * wait() needs to be called to ensure the execution is complete.
      */
-    void start(ICPPKernel *kernel, const Window &window);
-    /** Wait for the current kernel execution to complete
-     */
+    void start(ICPPKernel *kernel, const Window &window, const ThreadInfo &info);
+
+    /** Wait for the current kernel execution to complete. */
     void wait();
-    /** Function ran by the worker thread
-     */
+
+    /** Function ran by the worker thread. */
     void worker_thread();
 
 private:
-    std::thread        _thread;
-    ICPPKernel        *_kernel{ nullptr };
-    Window             _window;
-    sem_t              _wait_for_work;
-    sem_t              _job_complete;
-    std::exception_ptr _current_exception;
+    std::thread             _thread;
+    ICPPKernel             *_kernel{ nullptr };
+    Window                  _window;
+    ThreadInfo              _info;
+    std::mutex              _m;
+    std::condition_variable _cv;
+    bool                    _wait_for_work{ false };
+    bool                    _job_complete{ true };
+    std::exception_ptr      _current_exception;
 };
 
 Thread::Thread()
-    : _thread(), _window(), _wait_for_work(), _job_complete(), _current_exception(nullptr)
+    : _thread(), _window(), _info(), _m(), _cv(), _current_exception(nullptr)
 {
-    int ret = sem_init(&_wait_for_work, 0, 0);
-    ARM_COMPUTE_ERROR_ON(ret < 0);
-    ARM_COMPUTE_UNUSED(ret);
-
-    ret = sem_init(&_job_complete, 0, 0);
-    ARM_COMPUTE_ERROR_ON(ret < 0);
-    ARM_COMPUTE_UNUSED(ret);
-
     _thread = std::thread(&Thread::worker_thread, this);
 }
 
 Thread::~Thread()
 {
-    ARM_COMPUTE_ERROR_ON(!_thread.joinable());
-
-    start(nullptr, Window());
-    _thread.join();
-
-    int ret = sem_destroy(&_wait_for_work);
-    ARM_COMPUTE_ERROR_ON(ret < 0);
-    ARM_COMPUTE_UNUSED(ret);
-
-    ret = sem_destroy(&_job_complete);
-    ARM_COMPUTE_ERROR_ON(ret < 0);
-    ARM_COMPUTE_UNUSED(ret);
+    // Make sure worker thread has ended
+    if(_thread.joinable())
+    {
+        start(nullptr, Window(), ThreadInfo());
+        _thread.join();
+    }
 }
 
-void Thread::start(ICPPKernel *kernel, const Window &window)
+void Thread::start(ICPPKernel *kernel, const Window &window, const ThreadInfo &info)
 {
     _kernel = kernel;
     _window = window;
-    int ret = sem_post(&_wait_for_work);
-    ARM_COMPUTE_UNUSED(ret);
-    ARM_COMPUTE_ERROR_ON(ret < 0);
+    _info   = info;
+
+    {
+        std::lock_guard<std::mutex> lock(_m);
+        _wait_for_work = true;
+        _job_complete  = false;
+    }
+    _cv.notify_one();
 }
 
 void Thread::wait()
 {
-    int ret = sem_wait(&_job_complete);
-    ARM_COMPUTE_UNUSED(ret);
-    ARM_COMPUTE_ERROR_ON(ret < 0);
+    {
+        std::unique_lock<std::mutex> lock(_m);
+        _cv.wait(lock, [&] { return _job_complete; });
+    }
+
     if(_current_exception)
     {
         std::rethrow_exception(_current_exception);
@@ -121,9 +119,14 @@
 
 void Thread::worker_thread()
 {
-    while(sem_wait(&_wait_for_work) >= 0)
+    while(true)
     {
+        std::unique_lock<std::mutex> lock(_m);
+        _cv.wait(lock, [&] { return _wait_for_work; });
+        _wait_for_work = false;
+
         _current_exception = nullptr;
+
         // Time to exit
         if(_kernel == nullptr)
         {
@@ -133,49 +136,40 @@
         try
         {
             _window.validate();
-            _kernel->run(_window);
+            _kernel->run(_window, _info);
         }
         catch(...)
         {
             _current_exception = std::current_exception();
         }
-        int ret = sem_post(&_job_complete);
-        ARM_COMPUTE_UNUSED(ret);
-        ARM_COMPUTE_ERROR_ON(ret < 0);
+
+        _job_complete = true;
+        lock.unlock();
+        _cv.notify_one();
     }
-
-    ARM_COMPUTE_ERROR("Wait failed");
 }
 
-namespace
-{
-void delete_threads(Thread *t)
-{
-    delete[] t;
-}
-} // namespace
-
 CPPScheduler &CPPScheduler::get()
 {
     static CPPScheduler scheduler;
     return scheduler;
 }
 
-unsigned int CPPScheduler::num_threads() const
-{
-    return _num_threads;
-}
-
 CPPScheduler::CPPScheduler()
     : _num_threads(std::thread::hardware_concurrency()),
-      _threads(std::unique_ptr<Thread[], void(*)(Thread *)>(new Thread[std::thread::hardware_concurrency() - 1], delete_threads))
+      _threads(_num_threads - 1)
 {
 }
 
 void CPPScheduler::set_num_threads(unsigned int num_threads)
 {
-    const unsigned int num_cores = std::thread::hardware_concurrency();
-    _num_threads                 = num_threads == 0 ? num_cores : num_threads;
+    _num_threads = std::max(1u, num_threads == 0 ? std::thread::hardware_concurrency() : num_threads); // hardware_concurrency() may return 0: clamp so the subtraction below cannot wrap
+    _threads.resize(_num_threads - 1); // One worker fewer than _num_threads
+}
+
+unsigned int CPPScheduler::num_threads() const
+{
+    return _num_threads;
 }
 
 void CPPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
@@ -183,43 +177,51 @@
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
 
     /** [Scheduler example] */
+    ThreadInfo info;
+    info.cpu_info = _info;
+
     const Window      &max_window     = kernel->window();
     const unsigned int num_iterations = max_window.num_iterations(split_dimension);
-    const unsigned int num_threads    = std::min(num_iterations, _num_threads);
+    info.num_threads                  = std::min(num_iterations, _num_threads);
 
-    if(!kernel->is_parallelisable() || 1 == num_threads)
+    if(num_iterations == 0)
     {
-        kernel->run(max_window);
+        return;
+    }
+
+    if(!kernel->is_parallelisable() || info.num_threads == 1)
+    {
+        kernel->run(max_window, info);
     }
     else
     {
-        for(unsigned int t = 0; t < num_threads; ++t)
-        {
-            Window win = max_window.split_window(split_dimension, t, num_threads);
-            win.set_thread_id(t);
-            win.set_num_threads(num_threads);
+        int  t         = 0;
+        auto thread_it = _threads.begin();
 
-            if(t != num_threads - 1)
-            {
-                _threads[t].start(kernel, win);
-            }
-            else
-            {
-                kernel->run(win);
-            }
+        for(; t < info.num_threads - 1; ++t, ++thread_it)
+        {
+            Window win     = max_window.split_window(split_dimension, t, info.num_threads);
+            info.thread_id = t;
+            thread_it->start(kernel, win, info);
         }
 
+        // Run last part on main thread
+        Window win     = max_window.split_window(split_dimension, t, info.num_threads);
+        info.thread_id = t;
+        kernel->run(win, info);
+
         try
         {
-            for(unsigned int t = 1; t < num_threads; ++t)
+            for(auto &thread : _threads)
             {
-                _threads[t - 1].wait();
+                thread.wait();
             }
         }
         catch(const std::system_error &e)
         {
-            std::cout << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
+            std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
         }
     }
     /** [Scheduler example] */
 }
+} // namespace arm_compute
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index f086813..c8285b4 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -27,8 +27,8 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Utils.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 SingleThreadScheduler &SingleThreadScheduler::get()
 {
     static SingleThreadScheduler scheduler;
@@ -38,15 +38,19 @@
 void SingleThreadScheduler::set_num_threads(unsigned int num_threads)
 {
     ARM_COMPUTE_UNUSED(num_threads);
+    ARM_COMPUTE_ERROR_ON(num_threads != 1);
 }
 
 void SingleThreadScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
 {
     ARM_COMPUTE_UNUSED(split_dimension);
-    kernel->run(kernel->window());
+    ThreadInfo info;
+    info.cpu_info = cpu_info();
+    kernel->run(kernel->window(), info);
 }
 
 unsigned int SingleThreadScheduler::num_threads() const
 {
     return 1;
 }
+} // namespace arm_compute
diff --git a/src/runtime/Distribution1D.cpp b/src/runtime/Distribution1D.cpp
index b067674..3431834 100644
--- a/src/runtime/Distribution1D.cpp
+++ b/src/runtime/Distribution1D.cpp
@@ -24,14 +24,14 @@
 #include "arm_compute/runtime/Distribution1D.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 #include <cstdint>
 
 using namespace arm_compute;
 
 Distribution1D::Distribution1D(size_t num_bins, int32_t offset, uint32_t range)
-    : IDistribution1D(num_bins, offset, range), _data(arm_compute::cpp14::make_unique<uint32_t[]>(num_bins))
+    : IDistribution1D(num_bins, offset, range), _data(arm_compute::support::cpp14::make_unique<uint32_t[]>(num_bins))
 {
 }
 
diff --git a/src/runtime/HOG.cpp b/src/runtime/HOG.cpp
index 5d533dd..01640bb 100644
--- a/src/runtime/HOG.cpp
+++ b/src/runtime/HOG.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/HOG.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
@@ -37,7 +37,7 @@
 {
     ARM_COMPUTE_ERROR_ON(nullptr != _descriptor);
     _info       = input;
-    _descriptor = arm_compute::cpp14::make_unique<float[]>(_info.descriptor_size());
+    _descriptor = arm_compute::support::cpp14::make_unique<float[]>(_info.descriptor_size());
 }
 
 float *HOG::descriptor() const
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
new file mode 100644
index 0000000..4292469
--- /dev/null
+++ b/src/runtime/IScheduler.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/IScheduler.h"
+
+#include <array>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+namespace
+{
+/** Identify the CPU implementation of the core executing the calling thread.
+ *
+ * The id is the "CPU part" field that /proc/cpuinfo lists for the processor
+ * number returned by sched_getcpu().
+ *
+ * @note Only the first 1199 bytes of /proc/cpuinfo are inspected.
+ * @note When BARE_METAL is defined no lookup is performed.
+ *
+ * @return The parsed part number, or 0 if it could not be determined.
+ */
+unsigned int get_cpu_impl()
+{
+#ifndef BARE_METAL
+    std::array<char, 1200> buff{ {} };
+
+    const int cpu = sched_getcpu();
+    const int fd  = open("/proc/cpuinfo", O_RDONLY); // NOLINT
+
+    if(fd == -1)
+    {
+        return 0;
+    }
+
+    // Leave the last byte of the zero-initialised buffer untouched so the data
+    // is always NUL-terminated for the str* calls below.
+    const ssize_t charsread = read(fd, buff.data(), buff.size() - 1);
+    close(fd);
+    if(charsread <= 0)
+    {
+        return 0; // Read error or empty file: nothing to parse
+    }
+
+    // Offset from the start of a line at which its value is parsed
+    const int value_offset = 11;
+    bool      foundid      = false;
+    char     *pos          = buff.data();
+    char     *end          = buff.data() + charsread;
+
+    /* So, to date I've encountered two formats for /proc/cpuinfo.
+     *
+     * One of them just lists processor : n  for each processor (with no
+     * other info), then at the end lists part information for the current
+     * CPU.
+     *
+     * The other has an entire clause (including part number info) for each
+     * CPU in the system, with "processor : n" headers.
+     *
+     * We can cope with either of these formats by waiting to see
+     * "processor: n" (where n = our CPU ID), and then looking for the next
+     * "CPU part" field.
+     */
+    while(pos < end)
+    {
+        // Terminate the current line in place so no parser can run past it
+        char *eol = static_cast<char *>(memchr(pos, '\n', static_cast<size_t>(end - pos)));
+        if(eol == nullptr)
+        {
+            eol = end; // Unterminated last line: *end is still '\0'
+        }
+        *eol = '\0';
+
+        const bool has_value = (eol - pos) > value_offset;
+
+        if(foundid && strncmp(pos, "CPU part", 8) == 0)
+        {
+            /* Found part number */
+            if(!has_value)
+            {
+                return 0;
+            }
+
+            return strtoul(pos + value_offset, nullptr, 0);
+        }
+
+        if(has_value && strncmp(pos, "processor", 9) == 0)
+        {
+            /* Found processor ID, see if it's ours. */
+            if(strtol(pos + value_offset, nullptr, 0) == cpu)
+            {
+                foundid = true;
+            }
+        }
+
+        pos = eol + 1;
+    }
+#endif /* BARE_METAL */
+
+    return 0;
+}
+} // namespace
+
+namespace arm_compute
+{
+/** Select the CPU target for this build/host and set the cache-size defaults. */
+IScheduler::IScheduler()
+{
+    if(get_cpu_impl() == 0xd03) // The only part number given a dedicated target
+    {
+        _info.CPU = CPUTarget::A53;
+    }
+    else
+    {
+#ifdef __arm__
+        _info.CPU = CPUTarget::ARMV7;
+#elif __aarch64__
+        _info.CPU = CPUTarget::ARMV8;
+#else  /* __arm__ || __aarch64__ */
+        _info.CPU = CPUTarget::INTRINSICS;
+#endif /* __arm__ || __aarch64__ */
+    }
+
+    _info.L1_size = 31000;
+    _info.L2_size = 500000;
+}
+
+void IScheduler::set_target(CPUTarget target)
+{
+    _info.CPU = target;
+}
+
+CPUInfo IScheduler::cpu_info() const
+{
+    return _info;
+}
+} // namespace arm_compute
diff --git a/src/runtime/LutAllocator.cpp b/src/runtime/LutAllocator.cpp
index 17baf21..eb9051c 100644
--- a/src/runtime/LutAllocator.cpp
+++ b/src/runtime/LutAllocator.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/LutAllocator.h"
 
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
@@ -39,7 +39,7 @@
 
 void LutAllocator::allocate()
 {
-    _buffer = arm_compute::cpp14::make_unique<uint8_t[]>(size());
+    _buffer = arm_compute::support::cpp14::make_unique<uint8_t[]>(size());
 }
 
 uint8_t *LutAllocator::lock()
diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp
new file mode 100644
index 0000000..4dfa28b
--- /dev/null
+++ b/src/runtime/MemoryManagerOnDemand.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/ILifetimeManager.h"
+#include "arm_compute/runtime/IPoolManager.h"
+
+#include <memory>
+
+using namespace arm_compute;
+
+MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, std::shared_ptr<IPoolManager> pool_manager)
+    : _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager)), _allocator(nullptr), _is_finalized(false), _num_pools(1)
+{ // Starts unfinalized, with no allocator set and a pool count of one
+    ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr, "Lifetime manager not specified correctly!"); // a null lifetime manager is an error
+    ARM_COMPUTE_ERROR_ON_MSG(!_pool_mgr, "Pool manager not specified correctly!");         // a null pool manager is an error
+}
+
+bool MemoryManagerOnDemand::is_finalized() const // True once finalize() has run to completion
+{
+    return _is_finalized;
+}
+
+void MemoryManagerOnDemand::set_num_pools(unsigned int num_pools) // Sets how many pools finalize() registers
+{
+    ARM_COMPUTE_ERROR_ON(num_pools == 0); // a pool count of zero is treated as an error
+    _num_pools = num_pools;
+}
+
+void MemoryManagerOnDemand::set_allocator(IAllocator *allocator) // Stores the allocator that finalize() passes to create_pool()
+{
+    ARM_COMPUTE_ERROR_ON_MSG(is_finalized(), "Memory manager is already finalized!"); // must be called before finalize()
+    ARM_COMPUTE_ERROR_ON(allocator == nullptr);
+    _allocator = allocator; // pointer is stored as-is; NOTE(review): caller presumably keeps the allocator alive - confirm
+}
+
+ILifetimeManager *MemoryManagerOnDemand::lifetime_manager() // Accessor for the lifetime manager given to the constructor
+{
+    return _lifetime_mgr.get(); // raw pointer only; _lifetime_mgr itself is left untouched
+}
+
+IPoolManager *MemoryManagerOnDemand::pool_manager() // Accessor for the pool manager given to the constructor
+{
+    return _pool_mgr.get(); // raw pointer only; _pool_mgr itself is left untouched
+}
+
+void MemoryManagerOnDemand::finalize() // Creates and registers the pools, then marks the manager finalized
+{
+    ARM_COMPUTE_ERROR_ON_MSG(is_finalized(), "Memory manager is already finalized!");
+    ARM_COMPUTE_ERROR_ON(!_lifetime_mgr);
+    ARM_COMPUTE_ERROR_ON(!_pool_mgr);
+    ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr->are_all_finalized(), "All the objects have not been finalized! ");
+    ARM_COMPUTE_ERROR_ON(_allocator == nullptr);
+
+    // Register _num_pools pools in total: (_num_pools - 1) duplicates first, then the template itself
+    auto pool_template = _lifetime_mgr->create_pool(_allocator);
+    for(unsigned int i = _num_pools; i > 1; --i) // unsigned counter: the count comes from set_num_pools(unsigned int)
+    {
+        auto pool = pool_template->duplicate();
+        _pool_mgr->register_pool(std::move(pool));
+    }
+    _pool_mgr->register_pool(std::move(pool_template));
+
+    // From here on the is_finalized() checks in finalize() and set_allocator() will trip
+    _is_finalized = true;
+}
diff --git a/src/runtime/MultiHOG.cpp b/src/runtime/MultiHOG.cpp
index 003dc93..e0b60b1 100644
--- a/src/runtime/MultiHOG.cpp
+++ b/src/runtime/MultiHOG.cpp
@@ -24,13 +24,13 @@
 #include "arm_compute/runtime/MultiHOG.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IMultiHOG.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
 MultiHOG::MultiHOG(size_t num_models)
-    : _num_models(num_models), _model(arm_compute::cpp14::make_unique<HOG[]>(_num_models))
+    : _num_models(num_models), _model(arm_compute::support::cpp14::make_unique<HOG[]>(_num_models))
 {
 }
 
diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp
index 6f0da85..23d9872 100644
--- a/src/runtime/NEON/INESimpleFunction.cpp
+++ b/src/runtime/NEON/INESimpleFunction.cpp
@@ -27,13 +27,14 @@
 
 using namespace arm_compute;
 
-INESimpleFunction::INESimpleFunction()
-    : _kernel(), _border_handler()
+INESimpleFunction::INESimpleFunction() // NOLINT
+    : _kernel(),
+      _border_handler()
 {
 }
 
 void INESimpleFunction::run()
 {
-    _border_handler.run(_border_handler.window());
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
     NEScheduler::get().schedule(_kernel.get(), Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
index b39feb3..b4620f1 100644
--- a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
+++ b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void NEAbsoluteDifference::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEAbsoluteDifferenceKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEAbsoluteDifferenceKernel>();
     k->configure(input1, input2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEAccumulate.cpp b/src/runtime/NEON/functions/NEAccumulate.cpp
index c39abfc..49524d2 100644
--- a/src/runtime/NEON/functions/NEAccumulate.cpp
+++ b/src/runtime/NEON/functions/NEAccumulate.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEAccumulate.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void NEAccumulate::configure(const ITensor *input, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEAccumulateKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEAccumulateKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
@@ -41,13 +41,13 @@
 {
     if(use_fp16)
     {
-        auto k = arm_compute::cpp14::make_unique<NEAccumulateWeightedFP16Kernel>();
+        auto k = arm_compute::support::cpp14::make_unique<NEAccumulateWeightedFP16Kernel>();
         k->configure(input, alpha, output);
         _kernel = std::move(k);
     }
     else
     {
-        auto k = arm_compute::cpp14::make_unique<NEAccumulateWeightedKernel>();
+        auto k = arm_compute::support::cpp14::make_unique<NEAccumulateWeightedKernel>();
         k->configure(input, alpha, output);
         _kernel = std::move(k);
     }
@@ -55,7 +55,7 @@
 
 void NEAccumulateSquared::configure(const ITensor *input, uint32_t shift, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEAccumulateSquaredKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEAccumulateSquaredKernel>();
     k->configure(input, shift, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index f5d81d7..57a1738 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -23,14 +23,14 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-void NEActivationLayer::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
+void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
 {
-    auto k = arm_compute::cpp14::make_unique<NEActivationLayerKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEActivationLayerKernel>();
     k->configure(input, output, activation_info);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 50cc38b..11f5aa7 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
 {
-    auto k = arm_compute::cpp14::make_unique<NEArithmeticAdditionKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>();
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index a3d27c0..37586af 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
 {
-    auto k = arm_compute::cpp14::make_unique<NEArithmeticSubtractionKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEArithmeticSubtractionKernel>();
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index a24429c..ef79b02 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -37,7 +37,7 @@
 {
 }
 
-void NEBatchNormalizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
 {
     // Configure kernel
     _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
index 5aafc51..7982095 100644
--- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void NEBitwiseAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEBitwiseAndKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEBitwiseAndKernel>();
     k->configure(input1, input2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp
index af3df6e..c55957e 100644
--- a/src/runtime/NEON/functions/NEBitwiseNot.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void NEBitwiseNot::configure(const ITensor *input, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEBitwiseNotKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEBitwiseNotKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp
index d12c5e5..01036da 100644
--- a/src/runtime/NEON/functions/NEBitwiseOr.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void NEBitwiseOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEBitwiseOrKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEBitwiseOrKernel>();
     k->configure(input1, input2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp
index 65c943e..4591698 100644
--- a/src/runtime/NEON/functions/NEBitwiseXor.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void NEBitwiseXor::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEBitwiseXorKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEBitwiseXorKernel>();
     k->configure(input1, input2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEBox3x3.cpp b/src/runtime/NEON/functions/NEBox3x3.cpp
index 7f0b45d..46cf259 100644
--- a/src/runtime/NEON/functions/NEBox3x3.cpp
+++ b/src/runtime/NEON/functions/NEBox3x3.cpp
@@ -23,9 +23,9 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEBox3x3.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -35,13 +35,13 @@
 {
     if(use_fp16)
     {
-        auto k = arm_compute::cpp14::make_unique<NEBox3x3FP16Kernel>();
+        auto k = arm_compute::support::cpp14::make_unique<NEBox3x3FP16Kernel>();
         k->configure(input, output, border_mode == BorderMode::UNDEFINED);
         _kernel = std::move(k);
     }
     else
     {
-        auto k = arm_compute::cpp14::make_unique<NEBox3x3Kernel>();
+        auto k = arm_compute::support::cpp14::make_unique<NEBox3x3Kernel>();
         k->configure(input, output, border_mode == BorderMode::UNDEFINED);
         _kernel = std::move(k);
     }
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 26f31f5..9be1df6 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/runtime/NEON/functions/NECannyEdge.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
@@ -35,14 +34,27 @@
 #include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
 #include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
 
 #include <cstring>
 #include <utility>
 
 using namespace arm_compute;
 
-NECannyEdge::NECannyEdge()
-    : _sobel(), _gradient(), _non_max_suppr(), _edge_trace(), _border_mag_gradient(), _border_edge_trace(), _gx(), _gy(), _magnitude(), _phase(), _nonmax(), _output(nullptr)
+NECannyEdge::NECannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _sobel(),
+      _gradient(),
+      _non_max_suppr(),
+      _edge_trace(),
+      _border_mag_gradient(),
+      _border_edge_trace(),
+      _gx(),
+      _gy(),
+      _magnitude(),
+      _phase(),
+      _nonmax(),
+      _output(nullptr)
 {
 }
 
@@ -82,22 +94,26 @@
     _phase.allocator()->init(info);
     _nonmax.allocator()->init(info);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_gx);
+    _memory_group.manage(&_gy);
+
     // Configure/Init sobelNxN
     if(gradient_size == 3)
     {
-        auto k = arm_compute::cpp14::make_unique<NESobel3x3>();
+        auto k = arm_compute::support::cpp14::make_unique<NESobel3x3>();
         k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
         _sobel = std::move(k);
     }
     else if(gradient_size == 5)
     {
-        auto k = arm_compute::cpp14::make_unique<NESobel5x5>();
+        auto k = arm_compute::support::cpp14::make_unique<NESobel5x5>();
         k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
         _sobel = std::move(k);
     }
     else if(gradient_size == 7)
     {
-        auto k = arm_compute::cpp14::make_unique<NESobel7x7>();
+        auto k = arm_compute::support::cpp14::make_unique<NESobel7x7>();
         k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
         _sobel = std::move(k);
     }
@@ -106,20 +122,31 @@
         ARM_COMPUTE_ERROR("Gradient size not supported\n");
     }
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_magnitude);
+    _memory_group.manage(&_phase);
+
     // Configure gradient
     if(use_fp16)
     {
-        auto k = arm_compute::cpp14::make_unique<NEGradientFP16Kernel>();
+        auto k = arm_compute::support::cpp14::make_unique<NEGradientFP16Kernel>();
         k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
         _gradient = std::move(k);
     }
     else
     {
-        auto k = arm_compute::cpp14::make_unique<NEGradientKernel>();
+        auto k = arm_compute::support::cpp14::make_unique<NEGradientKernel>();
         k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
         _gradient = std::move(k);
     }
 
+    // Allocate intermediate tensors
+    _gx.allocator()->allocate();
+    _gy.allocator()->allocate();
+
+    // Manage intermediate buffers
+    _memory_group.manage(&_nonmax);
+
     // Configure non-maxima suppression
     _non_max_suppr.configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED);
 
@@ -127,6 +154,10 @@
     // it. If border mode is undefined filling the border is a nop.
     _border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), border_mode, constant_border_value);
 
+    // Allocate intermediate tensors
+    _phase.allocator()->allocate();
+    _magnitude.allocator()->allocate();
+
     // Configure edge tracing
     _edge_trace.configure(&_nonmax, output);
 
@@ -134,10 +165,6 @@
     _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, 0);
 
     // Allocate intermediate tensors
-    _gx.allocator()->allocate();
-    _gy.allocator()->allocate();
-    _phase.allocator()->allocate();
-    _magnitude.allocator()->allocate();
     _nonmax.allocator()->allocate();
 }
 
@@ -146,11 +173,13 @@
     ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
     ARM_COMPUTE_ERROR_ON(_output == nullptr);
 
+    _memory_group.acquire();
+
     // Run sobelNxN
     _sobel->run();
 
     // Fill border before non-maxima suppression. Nop for border mode undefined.
-    _border_mag_gradient.run(_border_mag_gradient.window());
+    NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ);
 
     // Run gradient
     NEScheduler::get().schedule(_gradient.get(), Window::DimY);
@@ -162,8 +191,10 @@
     memset(_output->buffer(), 0, _output->info()->total_size());
 
     // Fill border before edge trace
-    _border_edge_trace.run(_border_edge_trace.window());
+    NEScheduler::get().schedule(&_border_edge_trace, Window::DimZ);
 
     // Run edge tracing
-    _edge_trace.run(_edge_trace.window());
+    NEScheduler::get().schedule(&_edge_trace, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEChannelCombine.cpp b/src/runtime/NEON/functions/NEChannelCombine.cpp
index 84d4fff..9166aa9 100644
--- a/src/runtime/NEON/functions/NEChannelCombine.cpp
+++ b/src/runtime/NEON/functions/NEChannelCombine.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEChannelCombine.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,14 +32,14 @@
 
 void NEChannelCombine::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEChannelCombineKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEChannelCombineKernel>();
     k->configure(plane0, plane1, plane2, plane3, output);
     _kernel = std::move(k);
 }
 
 void NEChannelCombine::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEChannelCombineKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEChannelCombineKernel>();
     k->configure(plane0, plane1, plane2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEChannelExtract.cpp b/src/runtime/NEON/functions/NEChannelExtract.cpp
index 634e918..7b8a993 100644
--- a/src/runtime/NEON/functions/NEChannelExtract.cpp
+++ b/src/runtime/NEON/functions/NEChannelExtract.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEChannelExtract.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,14 +32,14 @@
 
 void NEChannelExtract::configure(const ITensor *input, Channel channel, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEChannelExtractKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEChannelExtractKernel>();
     k->configure(input, channel, output);
     _kernel = std::move(k);
 }
 
 void NEChannelExtract::configure(const IMultiImage *input, Channel channel, IImage *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEChannelExtractKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEChannelExtractKernel>();
     k->configure(input, channel, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEColorConvert.cpp b/src/runtime/NEON/functions/NEColorConvert.cpp
index bbaa832..b9fe1ff 100644
--- a/src/runtime/NEON/functions/NEColorConvert.cpp
+++ b/src/runtime/NEON/functions/NEColorConvert.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEColorConvert.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,28 +32,28 @@
 
 void NEColorConvert::configure(const ITensor *input, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
 
 void NEColorConvert::configure(const IMultiImage *input, IImage *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
 
 void NEColorConvert::configure(const IImage *input, IMultiImage *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
 
 void NEColorConvert::configure(const IMultiImage *input, IMultiImage *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index 3f39ae2..f10ffa6 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/runtime/NEON/functions/NEConvolution.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
 #include "arm_compute/core/PixelValue.h"
@@ -33,6 +32,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
 
 #include <array>
 #include <utility>
@@ -41,15 +41,15 @@
 
 void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<NEConvolution3x3Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEConvolution3x3Kernel>();
     k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
 }
 
 template <unsigned int matrix_size>
-NEConvolutionSquare<matrix_size>::NEConvolutionSquare()
-    : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+NEConvolutionSquare<matrix_size>::NEConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
 {
 }
 
@@ -72,6 +72,10 @@
 
         _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, intermediate_type));
 
+        // Manage intermediate buffers
+        _memory_group.manage(&_tmp);
+
+        // Calculate scale
         if(scale == 0)
         {
             scale = calculate_matrix_scale(conv, matrix_size);
@@ -94,12 +98,16 @@
 template <unsigned int matrix_size>
 void                   NEConvolutionSquare<matrix_size>::run()
 {
-    _border_handler.run(_border_handler.window());
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
 
     if(_is_separable)
     {
+        _memory_group.acquire();
+
         NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
         NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+
+        _memory_group.release();
     }
     else
     {
@@ -113,7 +121,7 @@
 
 void NEConvolutionRectangle::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<NEConvolutionRectangleKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEConvolutionRectangleKernel>();
     k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index bd688cf..40862fc 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -23,32 +23,41 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
 
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
 
 #include <cmath>
 #include <tuple>
 
-using namespace arm_compute;
-
-NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights()
-    : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+namespace arm_compute
+{
+NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
 {
 }
 
 void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
 
     if(biases != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
         ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
@@ -69,8 +78,11 @@
         TensorInfo         info_wr(shape_wr, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
 
         _weights_reshaped.allocator()->init(info_wr);
+        _memory_group.manage(&_weights_reshaped);
+
         _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
         _weights_transposed_kernel.configure(&_weights_reshaped, output);
+
         _weights_reshaped.allocator()->allocate();
     }
     else
@@ -81,32 +93,34 @@
 
 void NEConvolutionLayerReshapeWeights::run()
 {
+    _memory_group.acquire();
+
     NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+
     if(_transpose1xW)
     {
         NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY);
     }
+
+    _memory_group.release();
 }
 
-NEConvolutionLayer::NEConvolutionLayer()
-    : _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
-      _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _output_col2im_kernel(),
+      _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _workspace(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
 {
 }
 
 void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
     ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
 
     if(biases != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
         ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
@@ -131,94 +145,165 @@
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
 
-    const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size() : weights->info()->dimension(0);
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
-                                                 stride_x, stride_y, pad_x, pad_y, conv_info.round());
-    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+    const unsigned int kernel_width  = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
+    const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+                                                 conv_info);
 
-    // Check if its a "fully connected" convolution
+    // Check if it's a "fully connected" convolution, i.e. the output size is 1x1xnum_kernels
     _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
 
+#if defined(__arm__)
+    if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+    {
+        _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
+    }
+#elif defined(__aarch64__)
+    if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
+    {
+        _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
+    }
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
     unsigned int mat_weights_cols = weights->info()->dimension(3);
     unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
 
     // Reshape weights if needed
-    if(_are_weights_reshaped)
+    if(_mm_optimised_kernel != nullptr)
     {
-        mat_weights_cols                         = output->info()->dimension(2);
-        const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
-        mat_weights_rows                         = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
-    }
-    else
-    {
-        if(_is_fully_connected_convolution)
+        if(_are_weights_reshaped)
         {
-            // Create tensor to store the reshaped weights
-            TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
-            TensorInfo  info_wr(shape_wr, 1, dt, fixed_point_position);
-            _weights_reshaped.allocator()->init(info_wr);
-            _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+            mat_weights_cols = weights_info.num_kernels();
+            mat_weights_rows = weights->info()->dimension(1);
         }
         else
         {
-            // Create tensor to store transposed weights
-            const float transpose_width = 16.0f / input->info()->element_size();
-            TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
-            TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
-            _weights_reshaped.allocator()->init(info_wt);
-            _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
+            TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
+
+            // Create tensor to store the reshaped weights
+            _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+            _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+            weights = &_weights_reshaped;
         }
-        weights = &_weights_reshaped;
+    }
+    else
+    {
+        if(_are_weights_reshaped)
+        {
+            mat_weights_cols = weights_info.num_kernels();
+            mat_weights_rows = weights->info()->dimension(0) / 4 + (_has_bias ? 1 : 0);
+        }
+        else
+        {
+            TensorShape reshaped_weights_shape;
+
+            if(_is_fully_connected_convolution)
+            {
+                reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
+            }
+            else
+            {
+                // Create tensor to store transposed weights
+                const float transpose_width = 16.0f / input->info()->element_size();
+                reshaped_weights_shape      = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
+                                                           static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
+            }
+
+            // Create tensor to store the reshaped weights
+            _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+            _reshape_weights.configure(weights, biases, &_weights_reshaped, !_is_fully_connected_convolution /* 1xW transpose */);
+            weights = &_weights_reshaped;
+        }
     }
 
     // Create tensor to store im2col reshaped inputs
     const unsigned int mat_input_cols = mat_weights_rows;
     const unsigned int mat_input_rows = conv_w * conv_h;
-    TensorShape        shape_im2col   = input->info()->tensor_shape();
+
+    TensorShape shape_im2col(input->info()->tensor_shape());
     shape_im2col.set(0, mat_input_cols);
     shape_im2col.set(1, mat_input_rows);
     shape_im2col.set(2, 1);
     _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+    _memory_group.manage(&_input_im2col_reshaped);
 
     // Create tensor (interleave) to prepare input tensor for GEMM
-    if(!_is_fully_connected_convolution)
+    if(!_is_fully_connected_convolution && _mm_optimised_kernel == nullptr)
     {
-        TensorShape shape_interleaved = shape_im2col;
+        TensorShape shape_interleaved(shape_im2col);
         shape_interleaved.set(0, shape_interleaved.x() * 4);
         shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
         _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+        _memory_group.manage(&_input_interleaved_reshaped);
     }
 
     // Create GEMM output tensor
-    TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+    TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
     shape_gemm.set(0, mat_weights_cols);
     shape_gemm.set(1, mat_input_rows);
     _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
+    _memory_group.manage(&_gemm_output);
 
     // Configure kernels
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
-    if(_is_fully_connected_convolution)
+    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
+
+#if defined(__arm__) || defined(__aarch64__)
+    if(_mm_optimised_kernel != nullptr)
     {
-        _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+        struct CPUInfo ci = NEScheduler::get().cpu_info();
+
+        const int M = _gemm_output.info()->tensor_shape().y();
+        const int N = _gemm_output.info()->tensor_shape().x();
+        const int K = _input_im2col_reshaped.info()->tensor_shape().x();
+
+#if defined(__arm__)
+        GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+#elif defined(__aarch64__)
+        GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+        constexpr size_t alignment = 4096;
+        _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+        _memory_group.manage(&_workspace);
+
+        // Configure matrix multiplication kernel
+        if(_is_fully_connected_convolution)
+        {
+            _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f, false, false);
+        }
+        else
+        {
+            _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
+        }
+
+        _workspace.allocator()->allocate();
     }
     else
+#endif /* defined(__arm__) || defined(__aarch64__) */
     {
-        _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
-        _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+        if(_is_fully_connected_convolution)
+        {
+            _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+        }
+        else
+        {
+            _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+            _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+            _input_interleaved_reshaped.allocator()->allocate();
+        }
     }
+
+    _input_im2col_reshaped.allocator()->allocate();
     _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+    _gemm_output.allocator()->allocate();
+
+    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
 
     // Allocate intermediate tensor
     if(!_are_weights_reshaped)
     {
         _weights_reshaped.allocator()->allocate();
     }
-    _input_im2col_reshaped.allocator()->allocate();
-    if(!_is_fully_connected_convolution)
-    {
-        _input_interleaved_reshaped.allocator()->allocate();
-    }
-    _gemm_output.allocator()->allocate();
 }
 
 void NEConvolutionLayer::run()
@@ -230,17 +315,30 @@
         _reshape_weights.run();
     }
 
+    _memory_group.acquire();
+
     // Run input reshaping
     NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
-    if(!_is_fully_connected_convolution)
-    {
-        // Run interleave
-        NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
-    }
 
     // Runs matrix multiply on reshaped matrices
-    NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+    if(_mm_optimised_kernel != nullptr)
+    {
+        NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+    }
+    else
+    {
+        if(!_is_fully_connected_convolution)
+        {
+            // Run interleave
+            NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
+        }
+
+        NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+    }
 
     // Reshape output matrix
     NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+
+    _memory_group.release();
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
index 7d2c549..ddf7e90 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenate.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
@@ -24,28 +24,29 @@
 #include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-NEDepthConcatenate::NEDepthConcatenate()
-    : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+NEDepthConcatenate::NEDepthConcatenate() // NOLINT
+    : _inputs_vector(),
+      _concat_kernels_vector(),
+      _border_handlers_vector(),
+      _num_inputs(0)
 {
 }
 
-void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output) // NOLINT
 {
     ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
 
     _num_inputs             = inputs_vector.size();
-    _concat_kernels_vector  = arm_compute::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
-    _border_handlers_vector = arm_compute::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
+    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
+    _border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
 
     unsigned int depth_offset = 0;
     for(unsigned int i = 0; i < _num_inputs; ++i)
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvert.cpp
index a339cae..37857b6 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvert.cpp
@@ -23,22 +23,16 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
 
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
 using namespace arm_compute;
 
-void NEDepthConvert::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(input == output);
-    ARM_COMPUTE_ERROR_ON(input->info()->data_type() == output->info()->data_type());
-
-    auto k = arm_compute::cpp14::make_unique<NEDepthConvertKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
     k->configure(input, output, policy, shift);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
new file mode 100644
index 0000000..a58b6e4
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEDequantizationLayer::NEDequantizationLayer() // Default constructor: only value-initialises the kernel member; configure() does the real setup.
+    : _dequantize_kernel()
+{
+}
+
+void NEDequantizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *min_max) // Sets up the wrapped dequantization kernel.
+{
+    // Forward all three tensors unchanged to the kernel; no validation is performed at this level
+    _dequantize_kernel.configure(input, output, min_max);
+}
+
+void NEDequantizationLayer::run() // Runs the dequantization kernel through the NEON scheduler.
+{
+    NEScheduler::get().schedule(&_dequantize_kernel, Window::DimY); // NOTE(review): DimY is presumably the dimension the scheduler splits work along - confirm against IScheduler
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp
index 2887c13..8118030 100644
--- a/src/runtime/NEON/functions/NEDerivative.cpp
+++ b/src/runtime/NEON/functions/NEDerivative.cpp
@@ -42,11 +42,11 @@
     ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
 
     _kernel.configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-    _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
 }
 
 void NEDerivative::run()
 {
-    _border_handler.run(_border_handler.window());
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
     NEScheduler::get().schedule(&_kernel, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEDilate.cpp b/src/runtime/NEON/functions/NEDilate.cpp
index 0c016f1..5c733a8 100644
--- a/src/runtime/NEON/functions/NEDilate.cpp
+++ b/src/runtime/NEON/functions/NEDilate.cpp
@@ -23,9 +23,9 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEDilate.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 
 void NEDilate::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<NEDilateKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEDilateKernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 3f3e771..b831a6a 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -33,15 +33,13 @@
 
 using namespace arm_compute;
 
-NEDirectConvolutionLayer::NEDirectConvolutionLayer()
-    : _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
+NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
 {
 }
 
 void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
-
     // Free accumulator
     if(_accumulator.buffer() != nullptr)
     {
@@ -49,17 +47,38 @@
     }
 
     // Allocate the intermediate accumulator tensor in case of fixed point input
-    if(output->info()->data_type() == DataType::QS8)
+    switch(output->info()->data_type())
     {
-        _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
-        _conv_kernel.configure(input, weights, &_accumulator, conv_info);
-        _accumulate_bias_kernel.configure(&_accumulator, bias, output);
-        _accumulator.allocator()->allocate();
-    }
-    else
-    {
-        _conv_kernel.configure(input, weights, output, conv_info);
-        _accumulate_bias_kernel.configure(output, bias);
+        case DataType::QS8:
+        {
+            _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
+            _memory_group.manage(&_accumulator);
+            _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+            _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+            _accumulator.allocator()->allocate();
+            break;
+        }
+        case DataType::QS16:
+        {
+            _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS32, output->info()->fixed_point_position()));
+            _memory_group.manage(&_accumulator);
+            _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+            _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+            _accumulator.allocator()->allocate();
+            break;
+        }
+        case DataType::F16:
+        case DataType::F32:
+        {
+            _conv_kernel.configure(input, weights, output, conv_info);
+            _accumulate_bias_kernel.configure(output, bias);
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Data type not supported");
+            break;
+        }
     }
 
     // Add zero padding XY
@@ -68,8 +87,12 @@
 
 void NEDirectConvolutionLayer::run()
 {
-    _input_border_handler.run(_input_border_handler.window());
+    NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
+
+    _memory_group.acquire();
 
     NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
     NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
index f6ec677..70b93ca 100644
--- a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
+++ b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
@@ -55,7 +55,7 @@
     NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
 
     // Calculate cumulative distribution of histogram and create LUT.
-    _cd_histogram_kernel.run(_cd_histogram_kernel.window());
+    NEScheduler::get().schedule(&_cd_histogram_kernel, Window::DimY);
 
     // Map input to output using created LUT.
     NEScheduler::get().schedule(&_map_histogram_kernel, Window::DimY);
diff --git a/src/runtime/NEON/functions/NEErode.cpp b/src/runtime/NEON/functions/NEErode.cpp
index 9b011db..3609572 100644
--- a/src/runtime/NEON/functions/NEErode.cpp
+++ b/src/runtime/NEON/functions/NEErode.cpp
@@ -23,9 +23,9 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEErode.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 
 void NEErode::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<NEErodeKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEErodeKernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index 33a58f1..4137b1d 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -35,8 +35,9 @@
 
 using namespace arm_compute;
 
-NEFastCorners::NEFastCorners()
-    : _fast_corners_kernel(),
+NEFastCorners::NEFastCorners(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)),
+      _fast_corners_kernel(),
       _border_handler(),
       _nonmax_kernel(),
       _fill_kernel(),
@@ -59,6 +60,7 @@
 
     TensorInfo tensor_info(input->info()->tensor_shape(), Format::U8);
     _output.allocator()->init(tensor_info);
+    _memory_group.manage(&_output);
 
     // If border is UNDEFINED _fast_corners_kernel will operate in xwindow (3,
     // width - 3) and ywindow (3, height -3) so the output image will leave the
@@ -75,6 +77,7 @@
     else
     {
         _suppressed.allocator()->init(tensor_info);
+        _memory_group.manage(&_suppressed);
         _nonmax_kernel.configure(&_output, &_suppressed, BorderMode::UNDEFINED == border_mode);
         _fill_kernel.configure(&_suppressed, 1 /* we keep all texels >0 */, corners);
 
@@ -88,7 +91,9 @@
 
 void NEFastCorners::run()
 {
-    _border_handler.run(_border_handler.window());
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+
+    _memory_group.acquire();
 
     NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
 
@@ -98,4 +103,6 @@
     }
 
     NEScheduler::get().schedule(&_fill_kernel, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
index e884f4a..44e4952 100644
--- a/src/runtime/NEON/functions/NEFillBorder.cpp
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -30,7 +30,7 @@
 
 void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
 {
-    _border_handler.configure(input, border_width, border_mode, constant_border_value);
+    _border_handler.configure(input, BorderSize(border_width), border_mode, constant_border_value);
 }
 
 void NEFillBorder::run()
diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp
new file mode 100644
index 0000000..0000cdd
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFloor.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFloor.h"
+
+#include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void NEFloor::configure(const ITensor *input, ITensor *output) // Creates, configures and stores the floor kernel for this function.
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEFloorKernel>(); // owning pointer to a new NEFloorKernel
+    k->configure(input, output); // configure through the concrete NEFloorKernel type, before the pointer is handed to _kernel
+    _kernel = std::move(k); // transfer ownership into _kernel
+}
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index abb41e9..2e8d105 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -23,27 +23,28 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
 
+#include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include <algorithm>
 #include <cmath>
 
-using namespace arm_compute;
-
-NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights()
-    : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+namespace arm_compute
+{
+NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
 {
 }
 
 void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 2);
     ARM_COMPUTE_ERROR_ON(output == nullptr);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
-    ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
+    ARM_COMPUTE_ERROR_ON(!transpose_weights && !is_batched_fc_layer);
 
-    const DataType dt                   = input->info()->data_type();
+    const DataType data_type            = input->info()->data_type();
     const int      fixed_point_position = input->info()->fixed_point_position();
 
     _transpose_weights   = transpose_weights;
@@ -56,7 +57,8 @@
         {
             // Initialize the output tensor for transpose
             TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
-            _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
+            _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, data_type, fixed_point_position));
+            _memory_group.manage(&_transpose_output);
             _transpose_kernel.configure(input, &_transpose_output);
 
             // Configure transpose 1xW kernel
@@ -86,229 +88,161 @@
 
 void NEFullyConnectedLayerReshapeWeights::run()
 {
+    _memory_group.acquire();
+
     if(_transpose_weights)
     {
         NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
     }
+
     if(_is_batched_fc_layer)
     {
         NEScheduler::get().schedule(&_transpose1xW_kernel, Window::DimY);
     }
+
+    _memory_group.release();
 }
 
-NEFullyConnectedLayer::NEFullyConnectedLayer()
-    : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
-      _are_weights_reshaped(false), _is_fc_after_conv(false), _is_batched_fc_layer(false), _accumulate_biases(false)
+NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(),
+      _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false)
 {
 }
 
-void NEFullyConnectedLayer::configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
-
-    const DataType dt                   = input->info()->data_type();
-    const int      fixed_point_position = input->info()->fixed_point_position();
-
-    // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
-    // Initialize output tensor for im2col
-    TensorShape shape_im2col;
-    shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
-    shape_im2col.set(1, input->info()->dimension(3));
-    shape_im2col.set(2, input->info()->dimension(4));
-    shape_im2col.set(3, input->info()->dimension(5));
-    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
-
-    // Initialize output tensor for interleave 4x4
-    TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
-    shape_interleaved.set(0, shape_interleaved.x() * 4);
-    shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
-    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
-
-    // Configure im2col kernel
-    _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
-    // Configure interleave4x4 kernel
-    _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
-
-    // Configure matrix multiply kernel
-    _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
-
-    // Allocate the tensors once all the configure methods have been called
-    _im2col_output.allocator()->allocate();
-    _interleave4x4_output.allocator()->allocate();
-}
-
-void NEFullyConnectedLayer::configure_fc_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
-{
-    const DataType dt                   = input->info()->data_type();
-    const int      fixed_point_position = input->info()->fixed_point_position();
-
-    // Initialize output tensor for interleave 4x4
-    TensorShape shape_interleaved = input->info()->tensor_shape();
-    shape_interleaved.set(0, shape_interleaved.x() * 4);
-    shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
-    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
-
-    // Configure interleave4x4 kernel
-    _interleave4x4_kernel.configure(input, &_interleave4x4_output);
-
-    // Configure matrix multiply kernel
-    _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
-
-    // Allocate the tensors once all the configure methods have been called
-    _interleave4x4_output.allocator()->allocate();
-}
-
-void NEFullyConnectedLayer::configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
-    const DataType dt                   = input->info()->data_type();
-    const int      fixed_point_position = input->info()->fixed_point_position();
-
-    // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
-    // Initialize output tensor for im2col
-    TensorShape shape_im2col;
-    shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
-    shape_im2col.set(1, 1);
-    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
-
-    // Configure im2col kernel
-    _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
-    // Configure matrix multiply kernel
-    _mm_kernel.configure(&_im2col_output, weights, output, 1.0f);
-
-    // Allocate the output tensor for im2col once all the configure methods have been called
-    _im2col_output.allocator()->allocate();
-}
-
-void NEFullyConnectedLayer::configure_fc_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
-
-    // Configure matrix multiply kernel
-    _mm_kernel.configure(input, weights, output, 1.0f);
-}
-
 void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights, bool are_weights_reshaped)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
-
-    const DataType dt                   = input->info()->data_type();
-    const int      fixed_point_position = input->info()->fixed_point_position();
-
-    _are_weights_reshaped = are_weights_reshaped;
-    _is_fc_after_conv     = true;
-    _is_batched_fc_layer  = false;
-    _accumulate_biases    = false;
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-
-        _accumulate_biases = true;
-
-        // Configure accumulate biases kernel
-        _accumulate_biases_kernel.configure(output, biases);
-    }
-
     // With the Fully Connected layer we can have 4 different cases:
     //  1) Convolution layer -> Fully Connected layer without batches
     //  2) Fully Connected layer -> Fully Connected layer without batches
     //  3) Convolution layer -> Fully Connected layer with batches
     //  4) Fully Connected layer -> Fully Connected layer with batches
 
-    // Check if we have a fully connected layer with batches
-    _is_batched_fc_layer = (output->info()->dimension(1) > 1);
+    // Expected shape before transpose and reshaping
+    // Input: In x B (In and B can be multi-dimensional)
+    // Weights: flat(In) x Out
+    // Biases: Out
+    // Output: Out x B (B can be multi-dimensional)
 
-    const ITensor *weights_to_use = weights;
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, weights, output);
 
-    if(!are_weights_reshaped)
+    const DataType data_type            = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
+    const int      num_batch_dimensions = std::max(0, static_cast<int>(output->info()->tensor_shape().num_dimensions()) - 1);
+    const int      num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
+    const size_t   linear_input_size    = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
+
+    _linearize_input      = input->info()->tensor_shape().x() != linear_input_size;
+    _are_weights_reshaped = are_weights_reshaped;
+    _accumulate_biases    = biases != nullptr;
+    _is_batched_fc_layer  = num_batch_dimensions > 0;
+
+    // Check that the number of batches matches
+    ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size_upper(num_input_dimensions) != output->info()->tensor_shape().total_size_upper(1));
+    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
+
+    const size_t   interleave_width = 16 / input->info()->element_size();
+    const ITensor *weights_to_use   = weights;
+
+    if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
     {
-        if((transpose_weights || _is_batched_fc_layer))
+        weights_to_use = &_reshape_weights_output;
+
+        TensorShape reshaped_weights_shape(weights->info()->tensor_shape());
+
+        // Transpose weights if the user hasn't done it
+        if(transpose_weights)
         {
-            weights_to_use = &_reshape_weights_output;
-
-            if(transpose_weights)
-            {
-                if(_is_batched_fc_layer)
-                {
-                    const float transpose_width = 16.0f / input->info()->element_size();
-                    TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
-                    TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
-                    _reshape_weights_output.allocator()->init(info_wt);
-                }
-                else
-                {
-                    TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
-                    TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
-                    _reshape_weights_output.allocator()->init(info_wt);
-                }
-            }
-            else
-            {
-                ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
-
-                const float transpose_width = 16.0f / input->info()->element_size();
-                TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
-                TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
-                _reshape_weights_output.allocator()->init(info_wt);
-            }
-
-            // Reshape the weights
-            _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+            const size_t shape_x = reshaped_weights_shape.x();
+            reshaped_weights_shape.set(0, reshaped_weights_shape.y());
+            reshaped_weights_shape.set(1, shape_x);
         }
+
+        // If we run multiple batches we need a 1xW transpose, too.
+        if(_is_batched_fc_layer)
+        {
+            const float shape_x = reshaped_weights_shape.x();
+            reshaped_weights_shape.set(0, reshaped_weights_shape.y() * interleave_width);
+            reshaped_weights_shape.set(1, static_cast<unsigned int>(std::ceil(shape_x / interleave_width)));
+        }
+
+        _reshape_weights_output.allocator()->init(TensorInfo(reshaped_weights_shape, 1, data_type, fixed_point_position));
+
+        // Reshape the weights
+        _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+    }
+
+    // Check correct shape of weights
+    if(_is_batched_fc_layer)
+    {
+        // Transpose + Transpose1xW
+        ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().x() != linear_input_size * interleave_width);
+        ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().y() != static_cast<unsigned int>(std::ceil(static_cast<float>(output->info()->tensor_shape().x()) / interleave_width)));
+    }
+    else
+    {
+        // Transpose
+        ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().x() != output->info()->tensor_shape().x());
+        ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().y() != linear_input_size);
+    }
+
+    const ITensor *multiply_input = input;
+
+    if(_linearize_input)
+    {
+        TensorShape shape_im2col(input->info()->tensor_shape());
+        shape_im2col.collapse(num_input_dimensions);
+        _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, data_type, fixed_point_position));
+
+        // Configure im2col kernel
+        _memory_group.manage(&_im2col_output);
+        _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+        multiply_input = &_im2col_output;
     }
 
     if(_is_batched_fc_layer)
     {
-        _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
-                                                                                  input->info()->tensor_shape().cend(),
-                                                                                  output->info()->tensor_shape().cbegin() + 1));
+        TensorShape shape_interleaved(multiply_input->info()->tensor_shape());
+        shape_interleaved.set(0, shape_interleaved.x() * 4);
+        shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+        _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, data_type, fixed_point_position));
 
-        if(_is_fc_after_conv)
-        {
-            // Fully Connected layer after a Convolution Layer with batches
-            configure_conv_fc_wb(input, weights_to_use, output);
-        }
-        else
-        {
-            // Fully Connected layer after a Fully Connected Layer with batches
-            configure_fc_fc_wb(input, weights_to_use, output);
-        }
+        // Configure interleave4x4 kernel
+        _memory_group.manage(&_interleave4x4_output);
+        _interleave4x4_kernel.configure(multiply_input, &_interleave4x4_output);
+
+        multiply_input = &_interleave4x4_output;
     }
-    else
-    {
-        // In case of not batched fully connected layer, the weights will not be reshaped using transposed1xW
-        _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
 
-        if(_is_fc_after_conv)
-        {
-            // Fully Connected layer after a Convolution Layer without batches
-            configure_conv_fc_nb(input, weights_to_use, output);
-        }
-        else
-        {
-            // Fully Connected layer after a Fully Connected Layer without batches
-            configure_fc_fc_nb(input, weights_to_use, output);
-        }
+    // Configure matrix multiply kernel
+    _mm_kernel.configure(multiply_input, weights_to_use, output, 1.0f);
+
+    if(_accumulate_biases)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON(biases->info()->tensor_shape().x() != output->info()->tensor_shape().x());
+
+        // Configure accumulate biases kernel
+        _accumulate_biases_kernel.configure(output, biases);
     }
 
     // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
-    if(!are_weights_reshaped)
+    if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
     {
-        if(transpose_weights || _is_batched_fc_layer)
-        {
-            // Allocate the tensor for the weights reshaped
-            _reshape_weights_output.allocator()->allocate();
-        }
+        // Allocate the tensor for the weights reshaped
+        _reshape_weights_output.allocator()->allocate();
+    }
+
+    if(_linearize_input)
+    {
+        _im2col_output.allocator()->allocate();
+    }
+
+    if(_is_batched_fc_layer)
+    {
+        _interleave4x4_output.allocator()->allocate();
     }
 }
 
@@ -321,8 +255,10 @@
         _reshape_weights_kernel.run();
     }
 
-    // Linearize input if comes from a convolutional layer
-    if(_is_fc_after_conv)
+    _memory_group.acquire();
+
+    // Linearize input if it comes from a convolutional layer
+    if(_linearize_input)
     {
         NEScheduler::get().schedule(&_im2col_kernel, Window::DimY);
     }
@@ -341,4 +277,7 @@
     {
         NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
     }
+
+    _memory_group.release();
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 15d5f4e..ff92ef8 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -26,30 +26,41 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
 
 #include <cmath>
 
-using namespace arm_compute;
-
-NEGEMM::NEGEMM()
-    : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
+namespace arm_compute
+{
+NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
+      _run_vector_matrix_multiplication(false), _run_addition(false)
 {
 }
 
 void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16, DataType::QS8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(d, 1, DataType::F32, DataType::F16, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
+    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
 
     if(c != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
         ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
         ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
@@ -57,100 +68,135 @@
         ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
     }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
-    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
 
-    // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
-    if((a->info()->dimension(1) == 1))
+    // Check if the first input tensor is a vector.
+    // If so, all the kernels for reshaping the tensors can be skipped
+    if(_run_vector_matrix_multiplication)
     {
-        _run_vector_matrix_multiplication = true;
-
         // Configure the matrix multiply kernel
         _mm_kernel.configure(a, b, d, alpha);
+
+        // Configure matrix addition kernel
+        if(beta != 0 && c != nullptr)
+        {
+            _ma_kernel.configure(c, d, beta);
+            _run_addition = true;
+        }
     }
     else
     {
-        _run_vector_matrix_multiplication = false;
-
-        TensorShape shape_tmp_a = a->info()->tensor_shape();
-        TensorShape shape_tmp_b = b->info()->tensor_shape();
-
-        shape_tmp_a.set(0, a->info()->dimension(0) * 4);
-        shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
-
-        switch(a->info()->data_type())
+#if defined(__arm__)
+        if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
         {
-            case DataType::F32:
+            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
+        }
+#elif defined(__aarch64__)
+        if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
+        {
+            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
+        }
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+#if defined(__arm__) || defined(__aarch64__)
+        if(_mm_optimised_kernel != nullptr)
+        {
+            struct CPUInfo ci = NEScheduler::get().cpu_info();
+
+            const int M = d->info()->tensor_shape().y();
+            const int N = d->info()->tensor_shape().x();
+            const int K = a->info()->tensor_shape().x();
+
+#if defined(__arm__)
+            GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+#elif defined(__aarch64__)
+            GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+            constexpr size_t alignment = 4096;
+            _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+            _memory_group.manage(&_workspace);
+
+            // Configure matrix multiplication kernel
+            _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f);
+
+            _workspace.allocator()->allocate();
+        }
+        else
+#endif /* defined(__arm__) || defined(__aarch64__) */
+        {
+            TensorShape shape_tmp_a = a->info()->tensor_shape();
+            TensorShape shape_tmp_b = b->info()->tensor_shape();
+
+            shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+            shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
+
+            const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type());
+            shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
+            shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
+
+            TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
+            TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
+
+            _tmp_a.allocator()->init(info_a);
+            _tmp_b.allocator()->init(info_b);
+
+            // Manage intermediate buffers
+            _memory_group.manage(&_tmp_a);
+            _memory_group.manage(&_tmp_b);
+
+            // Configure interleave kernel
+            _interleave_kernel.configure(a, &_tmp_a);
+
+            // Configure transpose kernel
+            _transpose_kernel.configure(b, &_tmp_b);
+
+            // Configure matrix multiplication kernel
+            _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);
+
+            // Allocate once all the configure methods have been called
+            _tmp_a.allocator()->allocate();
+            _tmp_b.allocator()->allocate();
+
+            // Configure matrix addition kernel
+            if(beta != 0 && c != nullptr)
             {
-                shape_tmp_b.set(0, b->info()->dimension(1) * 4);
-                shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.0f));
-                break;
-            }
-            case DataType::F16:
-#ifdef ARM_COMPUTE_ENABLE_FP16
-                {
-                    shape_tmp_b.set(0, b->info()->dimension(1) * 8);
-                    shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 8.0f));
-                    break;
-                }
-#endif
-            case DataType::QS8:
-            {
-                shape_tmp_b.set(0, b->info()->dimension(1) * 16);
-                shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.0f));
-                break;
-            }
-            default:
-            {
-                ARM_COMPUTE_ERROR_ON("Data type not supported");
+                _ma_kernel.configure(c, d, beta);
+                _run_addition = true;
             }
         }
-
-        TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
-        TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
-
-        _tmp_a.allocator()->init(info_a);
-        _tmp_b.allocator()->init(info_b);
-
-        // Configure interleave kernel
-        _interleave_kernel.configure(a, &_tmp_a);
-
-        // Configure transpose kernel
-        _transpose_kernel.configure(b, &_tmp_b);
-
-        // Configure matrix multiplication kernel
-        _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);
-
-        // Allocate once the all configure methods have been called
-        _tmp_a.allocator()->allocate();
-        _tmp_b.allocator()->allocate();
-    }
-
-    // Configure matrix addition kernel
-    if(beta != 0 && c != nullptr)
-    {
-        _ma_kernel.configure(c, d, beta);
-        _run_addition = true;
     }
 }
 
 void NEGEMM::run()
 {
-    if(!_run_vector_matrix_multiplication)
-    {
-        // Run interleave kernel
-        NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
+    _memory_group.acquire();
 
-        // Run transpose kernel
-        NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+    if(_mm_optimised_kernel != nullptr)
+    {
+        NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+        _memory_group.release();
     }
-
-    // Run matrix multiply kernel
-    NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
-
-    // Run matrix addition kernel
-    if(_run_addition)
+    else
     {
-        NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+        if(!_run_vector_matrix_multiplication)
+        {
+            // Run interleave kernel
+            NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
+
+            // Run transpose kernel
+            NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+        }
+
+        NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
+
+        _memory_group.release();
+
+        // Run matrix addition kernel
+        if(_run_addition)
+        {
+            NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+        }
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
index 4c77c88..63f330b 100644
--- a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
+++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
@@ -23,14 +23,14 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
 void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp
index b64f769..7413b28 100644
--- a/src/runtime/NEON/functions/NEGEMMLowp.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowp.cpp
@@ -34,8 +34,8 @@
 
 using namespace arm_compute;
 
-NEGEMMLowp::NEGEMMLowp()
-    : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+NEGEMMLowp::NEGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
 {
 }
 
@@ -63,6 +63,10 @@
     _tmp_a.allocator()->init(info_a);
     _tmp_b.allocator()->init(info_b);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp_a);
+    _memory_group.manage(&_tmp_b);
+
     _interleave_kernel.configure(a, &_tmp_a);
     _transpose_kernel.configure(b, &_tmp_b);
     _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift);
@@ -73,6 +77,8 @@
 
 void NEGEMMLowp::run()
 {
+    _memory_group.acquire();
+
     /* Run interleave kernel */
     NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
 
@@ -81,4 +87,6 @@
 
     /* Run matrix multiply kernel */
     NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
index dc40ece..571bf2b 100644
--- a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
+++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
@@ -24,17 +24,17 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
 void NEGEMMTranspose1xW::configure(const ITensor *input, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEGaussian3x3.cpp b/src/runtime/NEON/functions/NEGaussian3x3.cpp
index 95ba5cb..db8eb63 100644
--- a/src/runtime/NEON/functions/NEGaussian3x3.cpp
+++ b/src/runtime/NEON/functions/NEGaussian3x3.cpp
@@ -23,9 +23,9 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 
 void NEGaussian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<NEGaussian3x3Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEGaussian3x3Kernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index 5ccc765..b010ca0 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -32,17 +32,20 @@
 
 using namespace arm_compute;
 
-NEGaussian5x5::NEGaussian5x5()
-    : _kernel_hor(), _kernel_vert(), _tmp(), _border_handler()
+NEGaussian5x5::NEGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _tmp(), _border_handler()
 {
 }
 
 void NEGaussian5x5::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
     // Init temporary buffer
-    TensorInfo tensor_info(input->info()->tensor_shape(), Format::S16);
+    TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16);
     _tmp.allocator()->init(tensor_info);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp);
+
     // Create and configure kernels for the two passes
     _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
     _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
@@ -54,7 +57,12 @@
 
 void NEGaussian5x5::run()
 {
-    _border_handler.run(_border_handler.window());
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+
+    _memory_group.acquire(); // _tmp is registered with this group in configure(); it is only used by the two passes below
+
     NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
     NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+
+    _memory_group.release(); // paired with the acquire() above, after the last kernel that touches _tmp
 }
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index e1d64f1..84ea0ca 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
 #include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
@@ -36,6 +35,7 @@
 #include "arm_compute/runtime/Pyramid.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
 
 #include <cstddef>
 
@@ -46,8 +46,10 @@
 {
 }
 
-NEGaussianPyramidHalf::NEGaussianPyramidHalf()
-    : _border_handler(), _horizontal_reduction(), _vertical_reduction()
+NEGaussianPyramidHalf::NEGaussianPyramidHalf() // NOLINT
+    : _border_handler(),
+      _horizontal_reduction(),
+      _vertical_reduction()
 {
 }
 
@@ -68,9 +70,9 @@
 
     if(num_levels > 1)
     {
-        _border_handler       = arm_compute::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
-        _horizontal_reduction = arm_compute::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
-        _vertical_reduction   = arm_compute::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
+        _border_handler       = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
+        _horizontal_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
+        _vertical_reduction   = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
 
         // Apply half scale to the X dimension of the tensor shape
         TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -107,14 +109,15 @@
 
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
-        _border_handler[i].run(_border_handler[i].window());
+        NEScheduler::get().schedule(_border_handler.get() + i, Window::DimZ);
         NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
         NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
     }
 }
 
-NEGaussianPyramidOrb::NEGaussianPyramidOrb()
-    : _offsets(), _gaus5x5(), _scale_nearest()
+NEGaussianPyramidOrb::NEGaussianPyramidOrb() // NOLINT
+    : _gaus5x5(),
+      _scale_nearest()
 {
 }
 
@@ -135,30 +138,19 @@
 
     if(num_levels > 1)
     {
-        _gaus5x5       = arm_compute::cpp14::make_unique<NEGaussian5x5[]>(num_levels - 1);
-        _scale_nearest = arm_compute::cpp14::make_unique<NEScaleKernel[]>(num_levels - 1);
-        _offsets       = arm_compute::cpp14::make_unique<Image[]>(num_levels - 1);
+        _gaus5x5       = arm_compute::support::cpp14::make_unique<NEGaussian5x5[]>(num_levels - 1);
+        _scale_nearest = arm_compute::support::cpp14::make_unique<NEScale[]>(num_levels - 1);
 
         PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
         _tmp.init(pyramid_info);
 
         for(unsigned int i = 0; i < num_levels - 1; ++i)
         {
-            const size_t width  = _pyramid->get_pyramid_level(i + 1)->info()->dimension(0);
-            const size_t height = _pyramid->get_pyramid_level(i + 1)->info()->dimension(1);
-
-            /* Allocate Image for the offsets used by NEAREST interpolation */
-            TensorInfo tensor_info(TensorShape(width, height), Format::S32);
-            _offsets[i].allocator()->init(tensor_info);
-
             /* Configure gaussian 5x5 */
             _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
 
-            /* Configure scale image kernel */
-            _scale_nearest[i].configure(_tmp.get_pyramid_level(i), nullptr, nullptr, _offsets.get() + i, _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR,
-                                        border_mode == BorderMode::UNDEFINED);
-
-            _offsets[i].allocator()->allocate();
+            /* Configure scale */
+            _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED);
         }
 
         _tmp.allocate();
@@ -178,6 +170,6 @@
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
         _gaus5x5[i].run();
-        NEScheduler::get().schedule(_scale_nearest.get() + i, Window::DimY);
+        _scale_nearest[i].run();
     }
 }
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index a592f53..5e98269 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -31,8 +31,8 @@
 
 using namespace arm_compute;
 
-NEHOGDescriptor::NEHOGDescriptor()
-    : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+NEHOGDescriptor::NEHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
 {
 }
 
@@ -71,9 +71,16 @@
     TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
     _hog_space.allocator()->init(info_space);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_mag);
+    _memory_group.manage(&_phase);
+
     // Initialise gradient kernel
     _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_hog_space);
+
     // Initialise orientation binning kernel
     _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
 
@@ -88,6 +95,8 @@
 
 void NEHOGDescriptor::run()
 {
+    _memory_group.acquire();
+
     // Run gradient
     _gradient.run();
 
@@ -96,4 +105,6 @@
 
     // Run block normalization kernel
     NEScheduler::get().schedule(&_block_norm, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp
index e8ed29d..49d0778 100644
--- a/src/runtime/NEON/functions/NEHOGDetector.cpp
+++ b/src/runtime/NEON/functions/NEHOGDetector.cpp
@@ -23,14 +23,14 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
 void NEHOGDetector::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
 {
-    auto k = arm_compute::cpp14::make_unique<NEHOGDetectorKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEHOGDetectorKernel>();
     k->configure(input, hog, detection_windows, detection_window_stride, threshold, idx_class);
     _kernel = std::move(k);
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index 2f4b880..efc8690 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -23,15 +23,19 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-NEHOGGradient::NEHOGGradient()
-    : _derivative(), _mag_phase(nullptr), _gx(), _gy()
+NEHOGGradient::NEHOGGradient(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _derivative(),
+      _mag_phase(nullptr),
+      _gx(),
+      _gy()
 {
 }
 
@@ -48,19 +52,23 @@
     _gx.allocator()->init(info);
     _gy.allocator()->init(info);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_gx);
+    _memory_group.manage(&_gy);
+
     // Initialise derivate kernel
     _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
 
     // Initialise magnitude/phase kernel
     if(PhaseType::UNSIGNED == phase_type)
     {
-        auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>();
+        auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>();
         k->configure(&_gx, &_gy, output_magnitude, output_phase);
         _mag_phase = std::move(k);
     }
     else
     {
-        auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+        auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
         k->configure(&_gx, &_gy, output_magnitude, output_phase);
         _mag_phase = std::move(k);
     }
@@ -72,9 +80,13 @@
 
 void NEHOGGradient::run()
 {
+    _memory_group.acquire(); // _gx and _gy are registered with this group in configure()
+
     // Run derivative
     _derivative.run();
 
     // Run magnitude/phase kernel
     NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
+
+    _memory_group.release(); // paired with the acquire() above, after the last consumer of _gx/_gy
 }
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index 173b8f4..8c834e2 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -24,16 +24,30 @@
 #include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-NEHOGMultiDetection::NEHOGMultiDetection()
-    : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(),
-      _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0)
+NEHOGMultiDetection::NEHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _gradient_kernel(),
+      _orient_bin_kernel(),
+      _block_norm_kernel(),
+      _hog_detect_kernel(),
+      _non_maxima_kernel(),
+      _hog_space(),
+      _hog_norm_space(),
+      _detection_windows(),
+      _mag(),
+      _phase(),
+      _non_maxima_suppression(false),
+      _num_orient_bin_kernel(0),
+      _num_block_norm_kernel(0),
+      _num_hog_detect_kernel(0)
 {
 }
 
@@ -112,12 +126,12 @@
     _num_block_norm_kernel  = input_block_norm.size(); // Number of NEHOGBlockNormalizationKernel kernels to compute
     _num_hog_detect_kernel  = input_hog_detect.size(); // Number of NEHOGDetector functions to compute
 
-    _orient_bin_kernel = arm_compute::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
-    _block_norm_kernel = arm_compute::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
-    _hog_detect_kernel = arm_compute::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
-    _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
-    _hog_space         = arm_compute::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
-    _hog_norm_space    = arm_compute::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
+    _orient_bin_kernel = arm_compute::support::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
+    _block_norm_kernel = arm_compute::support::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
+    _hog_detect_kernel = arm_compute::support::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
+    _non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
+    _hog_space         = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
+    _hog_norm_space    = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
 
     // Allocate tensors for magnitude and phase
     TensorInfo info_mag(shape_img, Format::S16);
@@ -126,6 +140,10 @@
     TensorInfo info_phase(shape_img, Format::U8);
     _phase.allocator()->init(info_phase);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_mag);
+    _memory_group.manage(&_phase);
+
     // Initialise gradient kernel
     _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
 
@@ -151,10 +169,17 @@
         TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
         _hog_space[i].allocator()->init(info_space);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_hog_space.get() + i);
+
         // Initialise orientation binning kernel
         _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
     }
 
+    // Allocate intermediate tensors
+    _mag.allocator()->allocate();
+    _phase.allocator()->allocate();
+
     // Configure NETensor for the normalized HOG space and block normalization kernel
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
@@ -165,10 +190,19 @@
         TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
         _hog_norm_space[i].allocator()->init(tensor_info);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_hog_norm_space.get() + i);
+
         // Initialize block normalization kernel
         _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
     }
 
+    // Allocate intermediate tensors
+    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+    {
+        _hog_space[i].allocator()->allocate();
+    }
+
     // Configure HOG detector kernel
     for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
     {
@@ -181,14 +215,6 @@
     _non_maxima_kernel->configure(_detection_windows, min_distance);
 
     // Allocate intermediate tensors
-    _mag.allocator()->allocate();
-    _phase.allocator()->allocate();
-
-    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
-    {
-        _hog_space[i].allocator()->allocate();
-    }
-
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
         _hog_norm_space[i].allocator()->allocate();
@@ -199,6 +225,8 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
 
+    _memory_group.acquire();
+
     // Reset detection window
     _detection_windows->clear();
 
@@ -226,6 +254,8 @@
     // Run non-maxima suppression kernel if enabled
     if(_non_maxima_suppression)
     {
-        _non_maxima_kernel->run(_non_maxima_kernel->window());
+        NEScheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
     }
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index b54fb67..25e28d2 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -35,14 +34,28 @@
 #include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
 #include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
 
 #include <cmath>
 #include <utility>
 
 using namespace arm_compute;
 
-NEHarrisCorners::NEHarrisCorners()
-    : _sobel(), _harris_score(), _non_max_suppr(), _candidates(), _sort_euclidean(), _border_gx(), _border_gy(), _gx(), _gy(), _score(), _nonmax(), _corners_list(), _num_corner_candidates(0)
+NEHarrisCorners::NEHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _sobel(),
+      _harris_score(),
+      _non_max_suppr(),
+      _candidates(),
+      _sort_euclidean(),
+      _border_gx(),
+      _border_gy(),
+      _gx(),
+      _gy(),
+      _score(),
+      _nonmax(),
+      _corners_list(),
+      _num_corner_candidates(0)
 {
 }
 
@@ -69,32 +82,36 @@
     _gx.allocator()->init(tensor_info_gxgy);
     _gy.allocator()->init(tensor_info_gxgy);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_gx);
+    _memory_group.manage(&_gy);
+
     TensorInfo tensor_info_score(shape, Format::F32);
     _score.allocator()->init(tensor_info_score);
     _nonmax.allocator()->init(tensor_info_score);
 
-    _corners_list = arm_compute::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+    _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
 
     // Set/init Sobel kernel accordingly with gradient_size
     switch(gradient_size)
     {
         case 3:
         {
-            auto k = arm_compute::cpp14::make_unique<NESobel3x3>();
+            auto k = arm_compute::support::cpp14::make_unique<NESobel3x3>();
             k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
             _sobel = std::move(k);
             break;
         }
         case 5:
         {
-            auto k = arm_compute::cpp14::make_unique<NESobel5x5>();
+            auto k = arm_compute::support::cpp14::make_unique<NESobel5x5>();
             k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
             _sobel = std::move(k);
             break;
         }
         case 7:
         {
-            auto k = arm_compute::cpp14::make_unique<NESobel7x7>();
+            auto k = arm_compute::support::cpp14::make_unique<NESobel7x7>();
             k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
             _sobel = std::move(k);
             break;
@@ -106,27 +123,30 @@
     // Normalization factor
     const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_score);
+
     if(use_fp16)
     {
         switch(block_size)
         {
             case 3:
             {
-                auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<3>>();
+                auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<3>>();
                 k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
                 _harris_score = std::move(k);
             }
             break;
             case 5:
             {
-                auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<5>>();
+                auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<5>>();
                 k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
                 _harris_score = std::move(k);
             }
             break;
             case 7:
             {
-                auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<7>>();
+                auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<7>>();
                 k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
                 _harris_score = std::move(k);
             }
@@ -141,21 +161,21 @@
         {
             case 3:
             {
-                auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<3>>();
+                auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<3>>();
                 k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
                 _harris_score = std::move(k);
             }
             break;
             case 5:
             {
-                auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<5>>();
+                auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<5>>();
                 k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
                 _harris_score = std::move(k);
             }
             break;
             case 7:
             {
-                auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<7>>();
+                auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<7>>();
                 k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
                 _harris_score = std::move(k);
             }
@@ -168,26 +188,35 @@
     _border_gx.configure(&_gx, _harris_score->border_size(), border_mode, constant_border_value);
     _border_gy.configure(&_gy, _harris_score->border_size(), border_mode, constant_border_value);
 
+    // Allocate once all the configure methods have been called
+    _gx.allocator()->allocate();
+    _gy.allocator()->allocate();
+
+    // Manage intermediate buffers
+    _memory_group.manage(&_nonmax);
+
     // Init non-maxima suppression function
     _non_max_suppr.configure(&_score, &_nonmax, border_mode);
 
+    // Allocate once all the configure methods have been called
+    _score.allocator()->allocate();
+
     // Init corner candidates kernel
     _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
 
+    // Allocate once all the configure methods have been called
+    _nonmax.allocator()->allocate();
+
     // Init euclidean distance
     _sort_euclidean.configure(_corners_list.get(), corners, &_num_corner_candidates, min_dist);
-
-    // Allocate once all the configure methods have been called
-    _gx.allocator()->allocate();
-    _gy.allocator()->allocate();
-    _score.allocator()->allocate();
-    _nonmax.allocator()->allocate();
 }
 
 void NEHarrisCorners::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
 
+    _memory_group.acquire();
+
     // Init to 0 number of corner candidates
     _num_corner_candidates = 0;
 
@@ -195,8 +224,8 @@
     _sobel->run();
 
     // Fill border before harris score kernel
-    _border_gx.run(_border_gx.window());
-    _border_gy.run(_border_gy.window());
+    NEScheduler::get().schedule(&_border_gx, Window::DimZ);
+    NEScheduler::get().schedule(&_border_gy, Window::DimZ);
 
     // Run harris score kernel
     NEScheduler::get().schedule(_harris_score.get(), Window::DimY);
@@ -208,5 +237,7 @@
     NEScheduler::get().schedule(&_candidates, Window::DimY);
 
     // Run sort & euclidean distance
-    _sort_euclidean.run(_sort_euclidean.window());
+    NEScheduler::get().schedule(&_sort_euclidean, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
index c42b2a5..f333ecb 100644
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -24,17 +24,17 @@
 #include "arm_compute/runtime/NEON/functions/NEHistogram.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IDistribution1D.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
 NEHistogram::NEHistogram()
-    : _histogram_kernel(), _local_hist(), _window_lut(arm_compute::cpp14::make_unique<uint32_t[]>(window_lut_default_size)), _local_hist_size(0)
+    : _histogram_kernel(), _local_hist(), _window_lut(arm_compute::support::cpp14::make_unique<uint32_t[]>(window_lut_default_size)), _local_hist_size(0)
 {
 }
 
@@ -45,7 +45,7 @@
 
     // Allocate space for threads local histograms
     _local_hist_size = output->num_bins() * NEScheduler::get().num_threads();
-    _local_hist      = arm_compute::cpp14::make_unique<uint32_t[]>(_local_hist_size);
+    _local_hist      = arm_compute::support::cpp14::make_unique<uint32_t[]>(_local_hist_size);
 
     // Configure kernel
     _histogram_kernel.configure(input, output, _local_hist.get(), _window_lut.get());
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
index af604e9..2e94ed5 100644
--- a/src/runtime/NEON/functions/NEIntegralImage.cpp
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -23,9 +23,9 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
 #include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 
 void NEIntegralImage::configure(const ITensor *input, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEIntegralImageKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEIntegralImageKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
     _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, 0);
diff --git a/src/runtime/NEON/functions/NEL2Normalize.cpp b/src/runtime/NEON/functions/NEL2Normalize.cpp
new file mode 100644
index 0000000..349a781
--- /dev/null
+++ b/src/runtime/NEON/functions/NEL2Normalize.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEL2Normalize.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEL2Normalize::NEL2Normalize(std::shared_ptr<IMemoryManager> memory_manager) // memory_manager is moved into _memory_group
+    : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
+{
+}
+
+void NEL2Normalize::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
+{
+    // Register the intermediate _sumsq tensor with the memory group before any kernel is configured with it
+    _memory_group.manage(&_sumsq);
+
+    // Configure the SUM_SQUARE reduction of input into _sumsq, then the normalisation kernel that reads input and _sumsq and writes output
+    _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
+    _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
+
+    // Allocate _sumsq only after every configure() call that references it has been made
+    _sumsq.allocator()->allocate();
+}
+
+void NEL2Normalize::run()
+{
+    _memory_group.acquire(); // _sumsq is registered with this group in configure()
+
+    _reduce_func.run(); // reduction configured with SUM_SQUARE; its output tensor is _sumsq
+    NEScheduler::get().schedule(&_normalize_kernel, Window::DimY); // configured with input, _sumsq and output
+
+    _memory_group.release(); // paired with the acquire() above, after the last consumer of _sumsq
+}
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index 8232c79..a680f1f 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IPyramid.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
@@ -33,11 +32,18 @@
 #include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
 #include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
 #include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-NELaplacianPyramid::NELaplacianPyramid()
-    : _num_levels(0), _gaussian_pyr_function(), _convf(), _subf(), _gauss_pyr(), _conv_pyr(), _depth_function()
+NELaplacianPyramid::NELaplacianPyramid() // NOLINT
+    : _num_levels(0),
+      _gaussian_pyr_function(),
+      _convf(),
+      _subf(),
+      _gauss_pyr(),
+      _conv_pyr(),
+      _depth_function()
 {
 }
 
@@ -86,8 +92,8 @@
     // Create Gaussian Pyramid function
     _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
 
-    _convf = arm_compute::cpp14::make_unique<NEGaussian5x5[]>(_num_levels);
-    _subf  = arm_compute::cpp14::make_unique<NEArithmeticSubtraction[]>(_num_levels);
+    _convf = arm_compute::support::cpp14::make_unique<NEGaussian5x5[]>(_num_levels);
+    _subf  = arm_compute::support::cpp14::make_unique<NEArithmeticSubtraction[]>(_num_levels);
 
     for(unsigned int i = 0; i < _num_levels; ++i)
     {
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
index 36ac4a7..0893701 100644
--- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -24,18 +24,21 @@
 #include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IPyramid.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
 
 #include <cstddef>
 
 using namespace arm_compute;
 
-NELaplacianReconstruct::NELaplacianReconstruct()
-    : _tmp_pyr(), _addf(), _scalef(), _depthf()
+NELaplacianReconstruct::NELaplacianReconstruct() // NOLINT
+    : _tmp_pyr(),
+      _addf(),
+      _scalef(),
+      _depthf()
 {
 }
 
@@ -61,8 +64,8 @@
     _tmp_pyr.init(pyramid_info);
 
     // Allocate add and scale functions. Level 0 does not need to be scaled.
-    _addf   = arm_compute::cpp14::make_unique<NEArithmeticAddition[]>(num_levels);
-    _scalef = arm_compute::cpp14::make_unique<NEScale[]>(num_levels - 1);
+    _addf   = arm_compute::support::cpp14::make_unique<NEArithmeticAddition[]>(num_levels);
+    _scalef = arm_compute::support::cpp14::make_unique<NEScale[]>(num_levels - 1);
 
     const size_t last_level = num_levels - 1;
 
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 85d7ba3..cb48598 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -33,8 +33,9 @@
 
 using namespace arm_compute;
 
-NELocallyConnectedLayer::NELocallyConnectedLayer()
-    : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+      _is_first_run(false)
 {
 }
 
@@ -65,11 +66,14 @@
     std::tie(stride_x, stride_y) = conv_info.stride();
     std::tie(pad_x, pad_y)       = conv_info.pad();
 
+    const unsigned int kernel_width  = weights->info()->dimension(0);
+    const unsigned int kernel_height = weights->info()->dimension(1);
+
     // Get convolved dimensions
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
-                                                 stride_x, stride_y, pad_x, pad_y, conv_info.round());
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+                                                 conv_info);
 
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
     ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
@@ -99,8 +103,12 @@
     shape_gemm.set(1, mat_input_rows);
     _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_input_im2col_reshaped);
+    _memory_group.manage(&_gemm_output);
+
     // Configure kernels
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
     _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
     _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
     _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
@@ -120,6 +128,8 @@
         NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
     }
 
+    _memory_group.acquire();
+
     // Run input reshaping
     NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
 
@@ -128,4 +138,6 @@
 
     // Reshape output matrix
     NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp
index 9390ca2..7877995 100644
--- a/src/runtime/NEON/functions/NEMagnitude.cpp
+++ b/src/runtime/NEON/functions/NEMagnitude.cpp
@@ -23,9 +23,9 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEMagnitude.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
 #include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -35,13 +35,13 @@
 {
     if(use_fp16)
     {
-        auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+        auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
         k->configure(input1, input2, output, nullptr);
         _kernel = std::move(k);
     }
     else
     {
-        auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+        auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
         k->configure(input1, input2, output, nullptr);
         _kernel = std::move(k);
     }
diff --git a/src/runtime/NEON/functions/NEMeanStdDev.cpp b/src/runtime/NEON/functions/NEMeanStdDev.cpp
index 47143f5..2304bc8 100644
--- a/src/runtime/NEON/functions/NEMeanStdDev.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDev.cpp
@@ -23,19 +23,19 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h"
 
-#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 using namespace arm_compute;
 
 NEMeanStdDev::NEMeanStdDev()
-    : _mean_stddev_kernel(), _global_sum(0), _global_sum_squared(0)
+    : _mean_stddev_kernel(), _fill_border_kernel(), _global_sum(0), _global_sum_squared(0)
 {
 }
 
-void NEMeanStdDev::configure(const IImage *input, float *mean, float *stddev)
+void NEMeanStdDev::configure(IImage *input, float *mean, float *stddev)
 {
     _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
+    _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
 }
 
 void NEMeanStdDev::run()
@@ -43,5 +43,6 @@
     _global_sum         = 0;
     _global_sum_squared = 0;
 
+    NEScheduler::get().schedule(&_fill_border_kernel, Window::DimZ);
     NEScheduler::get().schedule(&_mean_stddev_kernel, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEMedian3x3.cpp b/src/runtime/NEON/functions/NEMedian3x3.cpp
index aa7cc97..627e5fb 100644
--- a/src/runtime/NEON/functions/NEMedian3x3.cpp
+++ b/src/runtime/NEON/functions/NEMedian3x3.cpp
@@ -23,9 +23,9 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEMedian3x3.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 
 void NEMedian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<NEMedian3x3Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEMedian3x3Kernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEMinMaxLocation.cpp b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
index cab9200..54e89ab 100644
--- a/src/runtime/NEON/functions/NEMinMaxLocation.cpp
+++ b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
@@ -32,7 +32,7 @@
 {
 }
 
-void NEMinMaxLocation::configure(const IImage *input, int32_t *min, int32_t *max, ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
+void NEMinMaxLocation::configure(const IImage *input, void *min, void *max, ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
 {
     _min_max.configure(input, min, max);
     _min_max_loc.configure(input, min, max, min_loc, max_loc, min_count, max_count);
diff --git a/src/runtime/NEON/functions/NENonLinearFilter.cpp b/src/runtime/NEON/functions/NENonLinearFilter.cpp
index 01aea3b..57bd4e7 100644
--- a/src/runtime/NEON/functions/NENonLinearFilter.cpp
+++ b/src/runtime/NEON/functions/NENonLinearFilter.cpp
@@ -23,9 +23,9 @@
  */
 #include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -35,7 +35,7 @@
                                   BorderMode border_mode,
                                   uint8_t    constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<NENonLinearFilterKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NENonLinearFilterKernel>();
     k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
index a7b3759..3b59820 100644
--- a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
+++ b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,16 +32,16 @@
 
 void NENonMaximaSuppression3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode)
 {
-    auto k = arm_compute::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
 
     if(border_mode != BorderMode::UNDEFINED)
     {
-        _border_handler.configure(input, 1, BorderMode::CONSTANT, 0);
+        _border_handler.configure(input, BorderSize(1), BorderMode::CONSTANT, 0);
     }
     else
     {
-        _border_handler.configure(input, 1, BorderMode::UNDEFINED, 0);
+        _border_handler.configure(input, BorderSize(1), BorderMode::UNDEFINED, 0);
     }
 }
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index 69ff325..e01ef66 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -32,8 +32,8 @@
 
 using namespace arm_compute;
 
-NENormalizationLayer::NENormalizationLayer()
-    : _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared()
+NENormalizationLayer::NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared()
 {
 }
 
@@ -44,6 +44,9 @@
     TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
     _input_squared.allocator()->init(tensor_info);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_input_squared);
+
     // Configure kernels
     _norm_kernel.configure(input, &_input_squared, output, norm_info);
     _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
@@ -55,7 +58,11 @@
 
 void NENormalizationLayer::run()
 {
+    _memory_group.acquire();
+
     NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
     NEScheduler::get().schedule(&_border_handler, Window::DimY);
     NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index 49135e4..e90d8f6 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/runtime/NEON/functions/NEOpticalFlow.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -34,11 +33,21 @@
 #include "arm_compute/runtime/Pyramid.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-NEOpticalFlow::NEOpticalFlow()
-    : _func_scharr(), _kernel_tracker(), _scharr_gx(), _scharr_gy(), _new_points(nullptr), _new_points_estimates(nullptr), _old_points(nullptr), _new_points_internal(), _old_points_internal(),
+NEOpticalFlow::NEOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _func_scharr(),
+      _kernel_tracker(),
+      _scharr_gx(),
+      _scharr_gy(),
+      _new_points(nullptr),
+      _new_points_estimates(nullptr),
+      _old_points(nullptr),
+      _new_points_internal(),
+      _old_points_internal(),
       _num_levels(0)
 {
 }
@@ -65,10 +74,10 @@
 
     const float pyr_scale = old_pyramid->info()->scale();
 
-    _func_scharr    = arm_compute::cpp14::make_unique<NEScharr3x3[]>(_num_levels);
-    _kernel_tracker = arm_compute::cpp14::make_unique<NELKTrackerKernel[]>(_num_levels);
-    _scharr_gx      = arm_compute::cpp14::make_unique<Tensor[]>(_num_levels);
-    _scharr_gy      = arm_compute::cpp14::make_unique<Tensor[]>(_num_levels);
+    _func_scharr    = arm_compute::support::cpp14::make_unique<NEScharr3x3[]>(_num_levels);
+    _kernel_tracker = arm_compute::support::cpp14::make_unique<NELKTrackerKernel[]>(_num_levels);
+    _scharr_gx      = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_levels);
+    _scharr_gy      = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_levels);
 
     _old_points_internal = LKInternalKeypointArray(old_points->num_values());
     _new_points_internal = LKInternalKeypointArray(old_points->num_values());
@@ -89,6 +98,10 @@
         _scharr_gx[i].allocator()->init(tensor_info);
         _scharr_gy[i].allocator()->init(tensor_info);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_scharr_gx.get() + i);
+        _memory_group.manage(_scharr_gy.get() + i);
+
         // Init Scharr kernel
         _func_scharr[i].configure(old_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, border_mode, constant_border_value);
 
@@ -108,6 +121,8 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
 
+    _memory_group.acquire();
+
     for(unsigned int level = _num_levels; level > 0; --level)
     {
         // Run Scharr kernel
@@ -116,4 +131,6 @@
         // Run Lucas-Kanade kernel
         NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX);
     }
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEPhase.cpp b/src/runtime/NEON/functions/NEPhase.cpp
index 7683f46..436d22f 100644
--- a/src/runtime/NEON/functions/NEPhase.cpp
+++ b/src/runtime/NEON/functions/NEPhase.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEPhase.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+    auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
     k->configure(input1, input2, nullptr, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 056d33b..2e2ea11 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
-    auto k = arm_compute::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
     k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 6f0cc4f..4c4e11f 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -23,15 +23,15 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
 void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
 {
     // Configure pooling kernel
-    auto k = arm_compute::cpp14::make_unique<NEPoolingLayerKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEPoolingLayerKernel>();
     k->configure(input, output, pool_info);
     _kernel = std::move(k);
 
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
new file mode 100644
index 0000000..a131c48
--- /dev/null
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEQuantizationLayer::NEQuantizationLayer()
+    : _quantize_kernel(), _min_max_kernel(), _min_max()
+{
+}
+
+void NEQuantizationLayer::configure(const ITensor *input, ITensor *output)
+{
+    // Configure the min-max kernel to write into _min_max; the tensor is auto-configured within the kernel
+    _min_max_kernel.configure(input, &_min_max);
+
+    // Configure the quantize kernel, which is handed the same _min_max tensor alongside input and output
+    _quantize_kernel.configure(input, output, &_min_max);
+
+    // Allocate the _min_max tensor only after both kernels have been configured
+    _min_max.allocator()->allocate();
+}
+
+void NEQuantizationLayer::run()
+{
+    // Reset min and max on the min-max kernel before every run
+    _min_max_kernel.reset();
+
+    // Schedule the min-max kernel first (over Window::DimY); it was configured to fill _min_max
+    NEScheduler::get().schedule(&_min_max_kernel, Window::DimY);
+
+    // Then schedule the quantize kernel (over Window::DimY), which was configured with _min_max
+    NEScheduler::get().schedule(&_quantize_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
new file mode 100644
index 0000000..1f1400c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEROIPoolingLayer::NEROIPoolingLayer()
+    : _roi_kernel()
+{
+}
+
+void NEROIPoolingLayer::configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+    _roi_kernel.configure(input, rois, output, pool_info);
+}
+
+void NEROIPoolingLayer::run()
+{
+    NEScheduler::get().schedule(&_roi_kernel, Window::DimX);
+}
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
new file mode 100644
index 0000000..45c3e5d
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+namespace
+{
+/** Select the window dimension along which a reduction is split.
+ *
+ * @param[in] axis Reduction axis. Only axis 0 is handled; any other value raises an error.
+ *
+ * @return Window::DimY for axis 0
+ */
+size_t reduction_window_split_dimension(unsigned int axis)
+{
+    if(axis != 0)
+    {
+        ARM_COMPUTE_ERROR("Unsupported reduction axis");
+    }
+    return Window::DimY;
+}
+
+/** Select the border mode used when filling the input border before a reduction.
+ *
+ * @param[in] op Reduction operation. It currently has no influence on the result.
+ *
+ * @return BorderMode::CONSTANT for every operation, SUM_SQUARE included
+ */
+BorderMode reduction_operation_border_mode(ReductionOperation op)
+{
+    static_cast<void>(op);
+    return BorderMode::CONSTANT;
+}
+} // namespace
+
+NEReductionOperation::NEReductionOperation()
+    : _reduction_kernel(), _fill_border_kernel(), _window_split(0)
+{
+}
+
+void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+
+    // Set up the reduction kernel and record the dimension its window is split along
+    _reduction_kernel.configure(input, output, axis, op);
+    _window_split = reduction_window_split_dimension(axis);
+
+    // Set up the border fill: the kernel's border is used for axis 0, an empty border otherwise
+    const BorderSize border_size = (axis != 0) ? BorderSize() : _reduction_kernel.border_size();
+    const BorderMode border_mode = reduction_operation_border_mode(op);
+    _fill_border_kernel.configure(input, border_size, border_mode, PixelValue(0));
+}
+
+void NEReductionOperation::run()
+{
+    NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+    NEScheduler::get().schedule(&_reduction_kernel, _window_split);
+}
diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp
index 9f06fb6..882e93b 100644
--- a/src/runtime/NEON/functions/NERemap.cpp
+++ b/src/runtime/NEON/functions/NERemap.cpp
@@ -24,13 +24,13 @@
 #include "arm_compute/runtime/NEON/functions/NERemap.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/NERemapKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -44,7 +44,7 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
 
-    auto k = arm_compute::cpp14::make_unique<NERemapKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NERemapKernel>();
 
     k->configure(input, map_x, map_y, output, policy);
 
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
new file mode 100644
index 0000000..fef4e0c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEReshapeLayer::configure(const ITensor *input, ITensor *output)
+{
+    auto reshape_kernel = arm_compute::support::cpp14::make_unique<NEReshapeLayerKernel>();
+    reshape_kernel->configure(input, output);
+    _kernel = std::move(reshape_kernel);
+}
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index b70f626..bbd3fac 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -27,11 +27,12 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
 
 #include <cmath>
 #include <cstddef>
@@ -85,12 +86,16 @@
 }
 } // namespace
 
-NEScale::NEScale()
-    : _offsets(), _dx(), _dy()
+NEScale::NEScale() // NOLINT
+    : _offsets(),
+      _dx(),
+      _dy(),
+      _scale_kernel(),
+      _border_handler()
 {
 }
 
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == input);
     ARM_COMPUTE_ERROR_ON(nullptr == output);
@@ -116,8 +121,6 @@
         policy = InterpolationPolicy::NEAREST_NEIGHBOR;
     }
 
-    auto k = arm_compute::cpp14::make_unique<NEScaleKernel>();
-
     // Check if the border mode is UNDEFINED
     const bool border_undefined = border_mode == BorderMode::UNDEFINED;
 
@@ -128,7 +131,7 @@
             TensorInfo tensor_info_offsets(shape, Format::S32);
             _offsets.allocator()->init(tensor_info_offsets);
 
-            k->configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
+            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -146,7 +149,7 @@
             _dx.allocator()->init(tensor_info_dxdy);
             _dy.allocator()->init(tensor_info_dxdy);
 
-            k->configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined);
+            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -159,13 +162,18 @@
         }
         case InterpolationPolicy::AREA:
         {
-            k->configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined);
+            _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined);
             break;
         }
         default:
             ARM_COMPUTE_ERROR("Unsupported interpolation mode");
     }
 
-    _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+void NEScale::run() // Runs the function: border handling first, then the scale kernel.
+{
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ); // border handler was configured on the input tensor
+    NEScheduler::get().schedule(&_scale_kernel, Window::DimY); // scale kernel runs after the border has been scheduled
 }
diff --git a/src/runtime/NEON/functions/NEScharr3x3.cpp b/src/runtime/NEON/functions/NEScharr3x3.cpp
index 04b3f14..ba9985e 100644
--- a/src/runtime/NEON/functions/NEScharr3x3.cpp
+++ b/src/runtime/NEON/functions/NEScharr3x3.cpp
@@ -23,9 +23,9 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 
 void NEScharr3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<NEScharr3x3Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEScharr3x3Kernel>();
     k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NESobel3x3.cpp b/src/runtime/NEON/functions/NESobel3x3.cpp
index 3b46fd7..753b1f6 100644
--- a/src/runtime/NEON/functions/NESobel3x3.cpp
+++ b/src/runtime/NEON/functions/NESobel3x3.cpp
@@ -23,9 +23,9 @@
  */
 #include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -33,7 +33,7 @@
 
 void NESobel3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
 {
-    auto k = arm_compute::cpp14::make_unique<NESobel3x3Kernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NESobel3x3Kernel>();
     k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
index 8967a22..d8f4eda 100644
--- a/src/runtime/NEON/functions/NESobel5x5.cpp
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -32,8 +32,8 @@
 
 using namespace arm_compute;
 
-NESobel5x5::NESobel5x5()
-    : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
+NESobel5x5::NESobel5x5(std::shared_ptr<IMemoryManager> memory_manager) // memory_manager is handed over to _memory_group
+    : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler() // configure() registers _tmp_x/_tmp_y with the group
 {
 }
 
@@ -50,6 +50,8 @@
     {
         _tmp_x.allocator()->init(tensor_info);
         _tmp_y.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_x);
+        _memory_group.manage(&_tmp_y);
         _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
@@ -58,6 +60,7 @@
     else if(run_sobel_x)
     {
         _tmp_x.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_x);
         _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
@@ -65,6 +68,7 @@
     else if(run_sobel_y)
     {
         _tmp_y.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_y);
         _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_y.allocator()->allocate();
@@ -75,7 +79,12 @@
 
 void NESobel5x5::run()
 {
-    _border_handler.run(_border_handler.window());
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ); // border handler now goes through the scheduler instead of being run directly
+
+    _memory_group.acquire(); // called before the kernels that use _tmp_x/_tmp_y; presumably backs the managed temporaries - confirm in MemoryGroup
+
     NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
     NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
+
+    _memory_group.release(); // paired with acquire() once both Sobel kernels have been scheduled
 }
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
index f628da9..5b6f60b 100644
--- a/src/runtime/NEON/functions/NESobel7x7.cpp
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -32,8 +32,8 @@
 
 using namespace arm_compute;
 
-NESobel7x7::NESobel7x7()
-    : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
+NESobel7x7::NESobel7x7(std::shared_ptr<IMemoryManager> memory_manager) // memory_manager is handed over to _memory_group
+    : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler() // configure() registers _tmp_x/_tmp_y with the group
 {
 }
 
@@ -50,6 +50,8 @@
     {
         _tmp_x.allocator()->init(tensor_info);
         _tmp_y.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_x);
+        _memory_group.manage(&_tmp_y);
         _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
@@ -58,6 +60,7 @@
     else if(run_sobel_x)
     {
         _tmp_x.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_x);
         _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
@@ -65,6 +68,7 @@
     else if(run_sobel_y)
     {
         _tmp_y.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_y);
         _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_y.allocator()->allocate();
@@ -75,7 +79,12 @@
 
 void NESobel7x7::run()
 {
-    _border_handler.run(_border_handler.window());
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ); // border handler now goes through the scheduler instead of being run directly
+
+    _memory_group.acquire(); // called before the kernels that use _tmp_x/_tmp_y; presumably backs the managed temporaries - confirm in MemoryGroup
+
     NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
     NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
+
+    _memory_group.release(); // paired with acquire() once both Sobel kernels have been scheduled
 }
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 0651eab..cc5d4e9 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -31,15 +31,14 @@
 
 using namespace arm_compute;
 
-NESoftmaxLayer::NESoftmaxLayer()
-    : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp()
+NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager) // memory_manager is handed over to _memory_group
+    : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp() // configure() registers _tmp, _max and _sum with the group
 {
 }
 
 void NESoftmaxLayer::configure(ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
 
     // Create intermediate tensors shapes
     TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
@@ -51,11 +50,16 @@
     _max.allocator()->init(tensor_info_max_sum);
     _sum.allocator()->init(tensor_info_max_sum);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp);
+    _memory_group.manage(&_max);
+    _memory_group.manage(&_sum);
+
     // Configure Kernels
     _max_kernel.configure(input, &_max);
     _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
     _norm_kernel.configure(&_tmp, &_sum, output);
-    _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-FLT_MAX));
+    _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE);
 
     // Allocate intermediate tensors
     _tmp.allocator()->allocate();
@@ -65,8 +69,12 @@
 
 void NESoftmaxLayer::run()
 {
+    _memory_group.acquire(); // called before any kernel runs; presumably backs the managed _tmp/_max/_sum tensors - confirm in MemoryGroup
+
     NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
     NEScheduler::get().schedule(&_max_kernel, Window::DimY);
     NEScheduler::get().schedule(&_shift_exp_sum_kernel, Window::DimY);
     NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+
+    _memory_group.release(); // paired with acquire() after the last kernel has been scheduled
 }
diff --git a/src/runtime/NEON/functions/NETableLookup.cpp b/src/runtime/NEON/functions/NETableLookup.cpp
index ebb8a0a..cae117a 100644
--- a/src/runtime/NEON/functions/NETableLookup.cpp
+++ b/src/runtime/NEON/functions/NETableLookup.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NETableLookup.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void NETableLookup::configure(const ITensor *input, const ILut *lut, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NETableLookupKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NETableLookupKernel>(); // make_unique is now taken from arm_compute::support (support/ToolchainSupport.h)
     k->configure(input, lut, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEThreshold.cpp b/src/runtime/NEON/functions/NEThreshold.cpp
index 93dc124..37883e5 100644
--- a/src/runtime/NEON/functions/NEThreshold.cpp
+++ b/src/runtime/NEON/functions/NEThreshold.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEThreshold.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void NEThreshold::configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
 {
-    auto k = arm_compute::cpp14::make_unique<NEThresholdKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEThresholdKernel>(); // make_unique is now taken from arm_compute::support (support/ToolchainSupport.h)
     k->configure(input, output, threshold, false_value, true_value, type, upper);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
index 53ac9c5..eb81e02 100644
--- a/src/runtime/NEON/functions/NETranspose.cpp
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -32,7 +32,7 @@
 
 void NETranspose::configure(const ITensor *input, ITensor *output)
 {
-    auto k = arm_compute::cpp14::make_unique<NETransposeKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>(); // make_unique is now taken from arm_compute::support (support/ToolchainSupport.h)
     k->configure(input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEWarpAffine.cpp b/src/runtime/NEON/functions/NEWarpAffine.cpp
index 24fb16f..889d827 100644
--- a/src/runtime/NEON/functions/NEWarpAffine.cpp
+++ b/src/runtime/NEON/functions/NEWarpAffine.cpp
@@ -24,8 +24,9 @@
 #include "arm_compute/runtime/NEON/functions/NEWarpAffine.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -41,14 +42,14 @@
     {
         case InterpolationPolicy::NEAREST_NEIGHBOR:
         {
-            auto k = arm_compute::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
+            auto k = arm_compute::support::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
             k->configure(input, output, matrix, border_mode, constant_border_value);
             _kernel = std::move(k);
             break;
         }
         case InterpolationPolicy::BILINEAR:
         {
-            auto k = arm_compute::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::BILINEAR>>();
+            auto k = arm_compute::support::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::BILINEAR>>();
             k->configure(input, output, matrix, border_mode, constant_border_value);
             _kernel = std::move(k);
             break;
diff --git a/src/runtime/NEON/functions/NEWarpPerspective.cpp b/src/runtime/NEON/functions/NEWarpPerspective.cpp
index 84b2df5..ed5d6a0 100644
--- a/src/runtime/NEON/functions/NEWarpPerspective.cpp
+++ b/src/runtime/NEON/functions/NEWarpPerspective.cpp
@@ -24,8 +24,9 @@
 #include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
 
 #include <utility>
 
@@ -41,14 +42,14 @@
     {
         case InterpolationPolicy::NEAREST_NEIGHBOR:
         {
-            auto k = arm_compute::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
+            auto k = arm_compute::support::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
             k->configure(input, output, matrix, border_mode, constant_border_value);
             _kernel = std::move(k);
             break;
         }
         case InterpolationPolicy::BILINEAR:
         {
-            auto k = arm_compute::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::BILINEAR>>();
+            auto k = arm_compute::support::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::BILINEAR>>();
             k->configure(input, output, matrix, border_mode, constant_border_value);
             _kernel = std::move(k);
             break;
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index 0cced73..be81641 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -38,7 +38,7 @@
     return scheduler;
 }
 
-OMPScheduler::OMPScheduler()
+OMPScheduler::OMPScheduler() // NOLINT - lint marker kept as-is; _num_threads starts at omp_get_max_threads()
     : _num_threads(omp_get_max_threads())
 {
 }
diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp
new file mode 100644
index 0000000..42cc943
--- /dev/null
+++ b/src/runtime/PoolManager.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/PoolManager.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IMemoryPool.h"
+#include "support/ToolchainSupport.h"
+
+#include <algorithm>
+#include <list>
+
+using namespace arm_compute;
+
+// Creates a manager without pools; the semaphore stays unset until register_pool() creates it.
+PoolManager::PoolManager()
+    : _free_pools(), _occupied_pools(), _sem(), _mtx()
+{
+}
+
+// Waits on the semaphore, then moves the first free pool to the occupied list and returns a
+// non-owning pointer to it. The pool stays owned by the manager's lists.
+IMemoryPool *PoolManager::lock_pool()
+{
+    // _sem is only created by register_pool(), so a null semaphore means no pool was ever registered.
+    // Checking it instead of the lists avoids reading the lists without holding _mtx and guards the
+    // dereference in the next statement.
+    ARM_COMPUTE_ERROR_ON_MSG(_sem == nullptr, "Haven't setup any pools!");
+
+    _sem->wait();
+    std::lock_guard<arm_compute::Mutex> lock(_mtx);
+    ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty(), "Empty pool must exist as semaphore has been signalled");
+    _occupied_pools.splice(std::begin(_occupied_pools), _free_pools, std::begin(_free_pools));
+    return _occupied_pools.front().get();
+}
+
+// Moves the given pool from the occupied list back to the free list and signals the semaphore.
+void PoolManager::unlock_pool(IMemoryPool *pool)
+{
+    // Take the mutex before inspecting the lists: lock_pool() splices them under the same mutex.
+    std::lock_guard<arm_compute::Mutex> lock(_mtx);
+    ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty() && _occupied_pools.empty(), "Haven't setup any pools!");
+
+    auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), [pool](const std::unique_ptr<IMemoryPool> &pool_it)
+    {
+        return pool_it.get() == pool;
+    });
+    ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_occupied_pools), "Pool to be unlocked couldn't be found!");
+    _free_pools.splice(std::begin(_free_pools), _occupied_pools, it);
+    _sem->signal();
+}
+
+// Takes ownership of a new pool and adds it to the free list; requires that no pool is occupied.
+void PoolManager::register_pool(std::unique_ptr<IMemoryPool> pool)
+{
+    std::lock_guard<arm_compute::Mutex> lock(_mtx);
+    ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to register a new one!");
+
+    // Set pool
+    _free_pools.push_front(std::move(pool));
+
+    // Recreate the semaphore so that its initial count equals the number of free pools
+    _sem = arm_compute::support::cpp14::make_unique<arm_compute::Semaphore>(_free_pools.size());
+}
diff --git a/src/runtime/Pyramid.cpp b/src/runtime/Pyramid.cpp
index f1b6c93..ebd6570 100644
--- a/src/runtime/Pyramid.cpp
+++ b/src/runtime/Pyramid.cpp
@@ -24,10 +24,10 @@
 #include "arm_compute/runtime/Pyramid.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PyramidInfo.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
+#include "support/ToolchainSupport.h"
 
 #include <cmath>
 
@@ -46,7 +46,7 @@
 void Pyramid::internal_init(const PyramidInfo &info, bool auto_padding)
 {
     _info    = info;
-    _pyramid = arm_compute::cpp14::make_unique<Tensor[]>(_info.num_levels());
+    _pyramid = arm_compute::support::cpp14::make_unique<Tensor[]>(_info.num_levels());
 
     size_t      w            = _info.width();
     size_t      h            = _info.height();
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
index a131928..505c4a3 100644
--- a/src/runtime/Scheduler.cpp
+++ b/src/runtime/Scheduler.cpp
@@ -26,13 +26,13 @@
 #include "arm_compute/core/Error.h"
 #if ARM_COMPUTE_CPP_SCHEDULER
 #include "arm_compute/runtime/CPP/CPPScheduler.h"
-#endif
+#endif /* ARM_COMPUTE_CPP_SCHEDULER */
 
 #include "arm_compute/runtime/SingleThreadScheduler.h"
 
 #if ARM_COMPUTE_OPENMP_SCHEDULER
 #include "arm_compute/runtime/OMP/OMPScheduler.h"
-#endif
+#endif /* ARM_COMPUTE_OPENMP_SCHEDULER */
 
 using namespace arm_compute;
 
@@ -42,9 +42,9 @@
 Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
 #elif ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER
 Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
-#else
+#else  /* ARM_COMPUTE_*_SCHEDULER */
 Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST;
-#endif
+#endif /* ARM_COMPUTE_*_SCHEDULER */
 
 void Scheduler::set(Type t)
 {
@@ -64,17 +64,17 @@
         {
 #if ARM_COMPUTE_CPP_SCHEDULER
             return true;
-#else
+#else  /* ARM_COMPUTE_CPP_SCHEDULER */
             return false;
-#endif
+#endif /* ARM_COMPUTE_CPP_SCHEDULER */
         }
         case Type::OMP:
         {
 #if ARM_COMPUTE_OPENMP_SCHEDULER
             return true;
-#else
+#else  /* ARM_COMPUTE_OPENMP_SCHEDULER */
             return false;
-#endif
+#endif /* ARM_COMPUTE_OPENMP_SCHEDULER */
         }
         case Type::CUSTOM:
         {
@@ -105,18 +105,18 @@
         {
 #if ARM_COMPUTE_CPP_SCHEDULER
             return CPPScheduler::get();
-#else
+#else  /* ARM_COMPUTE_CPP_SCHEDULER */
             ARM_COMPUTE_ERROR("Recompile with cppthreads=1 to use C++11 scheduler.");
-#endif
+#endif /* ARM_COMPUTE_CPP_SCHEDULER */
             break;
         }
         case Type::OMP:
         {
 #if ARM_COMPUTE_OPENMP_SCHEDULER
             return OMPScheduler::get();
-#else
+#else  /* ARM_COMPUTE_OPENMP_SCHEDULER */
             ARM_COMPUTE_ERROR("Recompile with openmp=1 to use openmp scheduler.");
-#endif
+#endif /* ARM_COMPUTE_OPENMP_SCHEDULER */
             break;
         }
         case Type::CUSTOM:
diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp
index 435068c..a76c37e 100644
--- a/src/runtime/Tensor.cpp
+++ b/src/runtime/Tensor.cpp
@@ -26,7 +26,7 @@
 using namespace arm_compute;
 
 Tensor::Tensor()
-    : _allocator()
+    : _allocator(this) // the allocator now receives a pointer to the tensor that owns it
 {
 }
 
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 5c719c7..272b9f5 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 
 #include <cstddef>
 
@@ -63,11 +64,50 @@
 }
 } // namespace
 
-TensorAllocator::TensorAllocator()
-    : _buffer(nullptr)
+TensorAllocator::TensorAllocator(Tensor *owner) // owner: tensor this allocator belongs to, stored as a plain pointer
+    : _associated_memory_group(nullptr), _buffer(nullptr), _owner(owner) // starts without a buffer and without a memory group
 {
 }
 
+TensorAllocator::~TensorAllocator() // Releases the buffer only when this allocator allocated it itself.
+{
+    if((_associated_memory_group == nullptr) && (_buffer != nullptr)) // a buffer tied to a memory group is not deleted here
+    {
+        delete[] _buffer; // matches the new uint8_t[] in allocate()
+        _buffer = nullptr;
+        info().set_is_resizable(true);
+    }
+}
+
+TensorAllocator::TensorAllocator(TensorAllocator &&o) noexcept // Move constructor: takes over o's group, buffer and owner.
+    : ITensorAllocator(std::move(o)),
+      _associated_memory_group(o._associated_memory_group), // the derived-class pointers of o are copied after the base part has been moved
+      _buffer(o._buffer),
+      _owner(o._owner)
+{
+    o._associated_memory_group = nullptr; // o is left with null pointers, so its destructor has no buffer to delete
+    o._buffer                  = nullptr;
+    o._owner                   = nullptr;
+}
+
+TensorAllocator &TensorAllocator::operator=(TensorAllocator &&o) noexcept
+{
+    // Move assignment: release what this allocator owns, then take over o's memory group, buffer and owner.
+    if(&o != this)
+    {
+        // Release a self-allocated buffer first (overwriting _buffer would leak it); free() skips group-associated memory.
+        free();
+        _associated_memory_group   = o._associated_memory_group;
+        o._associated_memory_group = nullptr;
+        _buffer                    = o._buffer;
+        o._buffer                  = nullptr;
+        _owner                     = o._owner;
+        o._owner                   = nullptr;
+        ITensorAllocator::operator=(std::move(o)); // base-class state is moved last, as before
+    }
+    return *this;
+}
+
 void TensorAllocator::init(const TensorAllocator &allocator, const Coordinates &coords, TensorInfo sub_info)
 {
     // Get parent info
@@ -90,28 +130,44 @@
 
 uint8_t *TensorAllocator::data() const
 {
-    return (_buffer != nullptr) ? _buffer.get()->data() : nullptr;
+    return _buffer; // raw buffer pointer; nullptr while nothing has been allocated
 }
 
 void TensorAllocator::allocate()
 {
     ARM_COMPUTE_ERROR_ON(_buffer != nullptr);
-
-    _buffer = std::make_shared<std::vector<uint8_t>>(info().total_size());
+    if(_associated_memory_group == nullptr) // no memory group: the allocator creates the buffer itself
+    {
+        _buffer = new uint8_t[info().total_size()](); // the trailing () value-initializes the array, i.e. zero-fills it
+    }
+    else // tensor is associated with a memory group: delegate to the group
+    {
+        _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_buffer), info().total_size()); // passes the address of _buffer; presumably the group fills it in later - confirm in MemoryGroup
+    }
     info().set_is_resizable(false);
 }
 
 void TensorAllocator::free()
 {
-    ARM_COMPUTE_ERROR_ON(_buffer == nullptr);
+    if((_associated_memory_group == nullptr) && (_buffer != nullptr)) // no-op when nothing is allocated or when a memory group is associated
+    {
+        delete[] _buffer; // matches the new uint8_t[] in allocate()
+        _buffer = nullptr;
+        info().set_is_resizable(true); // the tensor info may be resized again once the buffer is gone
+    }
+}
 
-    _buffer.reset();
-    info().set_is_resizable(true);
+void TensorAllocator::set_associated_memory_group(MemoryGroup *associated_memory_group) // Associates this allocator with a memory group (stored as a plain pointer).
+{
+    ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr); // a group must be given
+    ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr); // and no group may have been set before
+    ARM_COMPUTE_ERROR_ON(_buffer != nullptr); // must be called before any buffer exists
+    _associated_memory_group = associated_memory_group;
 }
 
 uint8_t *TensorAllocator::lock()
 {
-    return (_buffer != nullptr) ? _buffer.get()->data() : nullptr;
+    return _buffer; // same raw pointer that data() returns; nullptr while nothing has been allocated
 }
 
 void TensorAllocator::unlock()