arm_compute v19.02

Change-Id: I853a3ecf38f206da13c1b03640c8adf73c20477c
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
index 7f0e374..d9de11e 100644
--- a/src/runtime/Allocator.cpp
+++ b/src/runtime/Allocator.cpp
@@ -44,6 +44,5 @@
 
 std::unique_ptr<IMemoryRegion> Allocator::make_region(size_t size, size_t alignment)
 {
-    ARM_COMPUTE_UNUSED(alignment);
-    return arm_compute::support::cpp14::make_unique<MemoryRegion>(size);
+    return arm_compute::support::cpp14::make_unique<MemoryRegion>(size, alignment);
 }
\ No newline at end of file
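With this change Allocator::make_region() forwards the caller's alignment to MemoryRegion instead of discarding it. A minimal caller-side sketch (size and alignment values are illustrative):

    #include "arm_compute/runtime/Allocator.h"
    #include <memory>

    // Sketch: request a 1 KiB CPU memory region aligned to a 64-byte boundary.
    std::unique_ptr<arm_compute::IMemoryRegion> make_aligned_region()
    {
        arm_compute::Allocator alloc;
        return alloc.make_region(1024 /* size */, 64 /* alignment */);
    }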
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index 2a4ab6e..c5d42b1 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,6 @@
 #include <algorithm>
 #include <cmath>
 #include <map>
-#include <vector>
 
 using namespace arm_compute;
 
@@ -62,19 +61,21 @@
     {
         return ba.max_size > bb.max_size;
     });
-    std::vector<size_t> group_sizes;
+
+    // Create group sizes vector
+    std::vector<BlobInfo> group_sizes;
     std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
     {
-        return b.max_size;
+        return BlobInfo(b.max_size, b.max_alignment);
     });
 
     // Update blob sizes
     size_t max_size = std::max(_blobs.size(), group_sizes.size());
-    _blobs.resize(max_size, 0);
-    group_sizes.resize(max_size, 0);
-    std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](size_t lhs, size_t rhs)
+    _blobs.resize(max_size);
+    group_sizes.resize(max_size);
+    std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs)
     {
-        return std::max(lhs, rhs);
+        return BlobInfo(std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment));
     });
 
     // Calculate group mappings
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
index e09451c..812cbdd 100644
--- a/src/runtime/BlobMemoryPool.cpp
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -33,11 +33,11 @@
 
 using namespace arm_compute;
 
-BlobMemoryPool::BlobMemoryPool(IAllocator *allocator, std::vector<size_t> blob_sizes)
-    : _allocator(allocator), _blobs(), _blob_sizes(std::move(blob_sizes))
+BlobMemoryPool::BlobMemoryPool(IAllocator *allocator, std::vector<BlobInfo> blob_info)
+    : _allocator(allocator), _blobs(), _blob_info(std::move(blob_info))
 {
     ARM_COMPUTE_ERROR_ON(!allocator);
-    allocate_blobs(_blob_sizes);
+    allocate_blobs(_blob_info);
 }
 
 BlobMemoryPool::~BlobMemoryPool()
@@ -73,16 +73,16 @@
 std::unique_ptr<IMemoryPool> BlobMemoryPool::duplicate()
 {
     ARM_COMPUTE_ERROR_ON(!_allocator);
-    return support::cpp14::make_unique<BlobMemoryPool>(_allocator, _blob_sizes);
+    return support::cpp14::make_unique<BlobMemoryPool>(_allocator, _blob_info);
 }
 
-void BlobMemoryPool::allocate_blobs(const std::vector<size_t> &sizes)
+void BlobMemoryPool::allocate_blobs(const std::vector<BlobInfo> &blob_info)
 {
     ARM_COMPUTE_ERROR_ON(!_allocator);
 
-    for(const auto &size : sizes)
+    for(const auto &bi : blob_info)
     {
-        _blobs.push_back(_allocator->make_region(size, 0));
+        _blobs.push_back(_allocator->make_region(bi.size, bi.alignment));
     }
 }
 
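The pool now records a BlobInfo (size plus alignment) per blob instead of a bare size. A minimal sketch of building a pool with explicit alignments; the values are illustrative and BlobInfo is assumed to be the size/alignment pair used above:

    #include "arm_compute/runtime/Allocator.h"
    #include "arm_compute/runtime/BlobMemoryPool.h"
    #include <vector>

    void build_pool_sketch()
    {
        arm_compute::Allocator allocator;
        // Two blobs: 4 KiB aligned to 64 bytes, 1 KiB aligned to 16 bytes.
        std::vector<arm_compute::BlobInfo> blobs = { arm_compute::BlobInfo(4096, 64), arm_compute::BlobInfo(1024, 16) };
        arm_compute::BlobMemoryPool pool(&allocator, blobs);
    }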
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
new file mode 100644
index 0000000..533e6fa
--- /dev/null
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/CLHelpers.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/Error.h"
+
+namespace
+{
+#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
+void printf_callback(const char *buffer, unsigned int len, size_t complete, void *user_data)
+{
+    printf("%.*s", len, buffer);
+}
+#endif /* defined(ARM_COMPUTE_ASSERTS_ENABLED) */
+
+/** Initialise the properties array with the configuration to be used when creating the OpenCL context
+ *
+ * @param[in]  platform The OpenCL platform used to create the context
+ * @param[in]  device   The OpenCL device used to create the context
+ * @param[out] prop     The array of context properties to be initialised
+ *
+ * @note When asserts are enabled (ARM_COMPUTE_ASSERTS_ENABLED), this function enables the cl_arm_printf callback if the device supports it.
+ */
+
+void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, cl_context_properties prop[7])
+{
+    ARM_COMPUTE_UNUSED(device);
+#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
+    // Query devices in the context for cl_arm_printf support
+    if(arm_compute::device_supports_extension(device, "cl_arm_printf"))
+    {
+        // Create a cl_context with a printf_callback and user specified buffer size.
+        cl_context_properties properties_printf[] =
+        {
+            CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
+            // Enable a printf callback function for this context.
+            CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
+            // Request a minimum printf buffer size of 0x1000 bytes for devices
+            // in the context that support this extension.
+            CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
+            0
+        };
+        std::copy_n(properties_printf, 7, prop);
+    }
+    else
+#endif // defined(ARM_COMPUTE_ASSERTS_ENABLED)
+    {
+        cl_context_properties properties[] =
+        {
+            CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
+            0
+        };
+        std::copy_n(properties, 3, prop);
+    };
+}
+} //namespace
+
+namespace arm_compute
+{
+std::tuple<cl::Context, cl::Device, cl_int>
+create_opencl_context_and_device()
+{
+    ARM_COMPUTE_ERROR_ON(!opencl_is_available());
+    std::vector<cl::Platform> platforms;
+    cl::Platform::get(&platforms);
+    ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
+    cl::Platform            p = platforms[0];
+    cl::Device              device;
+    std::vector<cl::Device> platform_devices;
+    p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
+    ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
+    device                              = platform_devices[0];
+    cl_int                err           = CL_SUCCESS;
+    cl_context_properties properties[7] = { 0, 0, 0, 0, 0, 0, 0 };
+    initialise_context_properties(p, device, properties);
+    cl::Context cl_context = cl::Context(device, properties, nullptr, nullptr, &err);
+    ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
+    return std::make_tuple(cl_context, device, err);
+}
+} // namespace arm_compute
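This helper factors out the platform/device discovery that CLScheduler::default_init() used to do inline (see the CLScheduler.cpp hunk below). A minimal sketch of calling it directly:

    #include "arm_compute/runtime/CL/CLHelpers.h"
    #include <tuple>

    void create_context_sketch()
    {
        cl::Context ctx;
        cl::Device  dev;
        cl_int      err = CL_SUCCESS;
        std::tie(ctx, dev, err) = arm_compute::create_opencl_context_and_device();
        // On success, err == CL_SUCCESS and ctx/dev refer to the first default device
        // of the first available platform, as selected by the code above.
    }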
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index a311c6f..701ffe0 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,22 +23,14 @@
  */
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "arm_compute/runtime/CL/CLHelpers.h"
+
 #include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/runtime/CL/CLTuner.h"
 #include "arm_compute/runtime/CL/tuners/Tuners.h"
 
 using namespace arm_compute;
 
-namespace
-{
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-void printf_callback(const char *buffer, unsigned int len, size_t complete, void *user_data)
-{
-    printf("%.*s", len, buffer);
-}
-#endif /* defined(ARM_COMPUTE_DEBUG_ENABLED) */
-} // namespace
-
 std::once_flag CLScheduler::_initialize_symbols;
 
 CLScheduler::CLScheduler()
@@ -53,53 +45,30 @@
     return scheduler;
 }
 
+void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner)
+{
+    if(!_is_initialised)
+    {
+        cl::CommandQueue queue = cl::CommandQueue(ctx, device);
+        CLKernelLibrary::get().init("./cl_kernels/", ctx, device);
+        init(ctx, queue, device, cl_tuner);
+        _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
+        _cl_tuner                = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
+    }
+}
+
 void CLScheduler::default_init(ICLTuner *cl_tuner)
 {
     if(!_is_initialised)
     {
-        std::vector<cl::Platform> platforms;
-        cl::Platform::get(&platforms);
-        ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
-        cl::Platform            p = platforms[0];
-        cl::Context             ctx;
-        cl::Device              device;
-        std::vector<cl::Device> platform_devices;
-        p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
-        ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
-        device = platform_devices[0];
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-
-        // Query devices in the context for cl_arm_printf support
-        if(device_supports_extension(device, "cl_arm_printf"))
-        {
-            // Create a cl_context with a printf_callback and user specified buffer size.
-            cl_context_properties properties[] =
-            {
-                CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(p()),
-                // Enable a printf callback function for this context.
-                CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
-                // Request a minimum printf buffer size of 4MB for devices in the
-                // context that support this extension.
-                CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
-                0
-            };
-            ctx = cl::Context(device, properties);
-        }
-        else
-#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
-        {
-            cl_context_properties properties[] =
-            {
-                CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(p()),
-                0
-            };
-            ctx = cl::Context(device, properties);
-        };
-
-        cl::CommandQueue queue = cl::CommandQueue(ctx, device);
-        CLKernelLibrary::get().init("./cl_kernels/", ctx, device);
-        init(ctx, queue, device, cl_tuner);
-
+        cl::Context ctx;
+        cl::Device  dev;
+        cl_int      err;
+        std::tie(ctx, dev, err) = create_opencl_context_and_device();
+        ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
+        cl::CommandQueue queue = cl::CommandQueue(ctx, dev);
+        CLKernelLibrary::get().init("./cl_kernels/", ctx, dev);
+        init(ctx, queue, dev, cl_tuner);
         // Create a default static tuner and set if none was provided
         _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
     }
@@ -108,6 +77,21 @@
     _cl_tuner = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
 }
 
+void CLScheduler::set_context(cl::Context context)
+{
+    _context = std::move(context);
+    CLKernelLibrary::get().set_context(_context);
+}
+
+void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner)
+{
+    set_context(std::move(context));
+    _queue          = std::move(queue);
+    _target         = get_target_from_device(device);
+    _is_initialised = true;
+    _cl_tuner       = cl_tuner;
+}
+
 void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
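default_init_with_context() lets an application that already owns an OpenCL context and device hand them to the scheduler, instead of having default_init() create new ones. A minimal sketch, reusing the helper introduced above and passing no custom tuner:

    #include "arm_compute/runtime/CL/CLHelpers.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include <tuple>

    void init_scheduler_with_existing_context()
    {
        cl::Context ctx;
        cl::Device  dev;
        cl_int      err = CL_SUCCESS;
        std::tie(ctx, dev, err) = arm_compute::create_opencl_context_and_device();
        // Hand the externally created context/device to the scheduler (no tuner).
        arm_compute::CLScheduler::get().default_init_with_context(dev, ctx, nullptr);
    }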
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 5f82cd3..a262d6b 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,10 +33,40 @@
 #include <limits>
 #include <string>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+/** Utility function used to initialize the LWS values to test.
+ *  Only LWS values that are powers of two, or that satisfy the modulo conditions with the GWS, are taken into account by the CLTuner
+ *
+ * @param[in, out] lws         Vector of LWS to test for a specific dimension
+ * @param[in]      gws         Size of the GWS
+ * @param[in]      lws_max     Max LWS value allowed to be tested
+ * @param[in]      mod_let_one True if the result of the modulo operation between gws and lws can be less than or equal to one.
+ */
+void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
+{
+    lws.push_back(1);
+
+    for(unsigned int i = 2; i <= lws_max; ++i)
+    {
+        // Power of two condition
+        const bool is_power_of_two = (i & (i - 1)) == 0;
+
+        // Modulo condition, according to the mod_let_one flag
+        const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
+
+        if(mod_cond || is_power_of_two)
+        {
+            lws.push_back(i);
+        }
+    }
+}
+} // namespace
 
 CLTuner::CLTuner(bool tune_new_kernels)
-    : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
+    : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
 {
 }
 
@@ -102,32 +132,35 @@
 
 cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
 {
+    // Profiling queue
+    cl::CommandQueue queue_profiler;
+
+    // Extract real OpenCL function to intercept
     if(real_clEnqueueNDRangeKernel == nullptr)
     {
         real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
-
-        // Get the default queue
-        _queue = CLScheduler::get().queue();
-
-        // Check if we can use the OpenCL timer with the default queue
-        cl_command_queue_properties props = _queue.getInfo<CL_QUEUE_PROPERTIES>();
-
-        if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
-        {
-            // Set the queue for profiling
-            _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
-        }
-        else
-        {
-            _queue_profiler = _queue;
-        }
     }
+
+    // Get the default queue
+    cl::CommandQueue default_queue = CLScheduler::get().queue();
+
+    // Check if we can use the OpenCL timer with the default queue
+    cl_command_queue_properties props = default_queue.getInfo<CL_QUEUE_PROPERTIES>();
+
+    if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
+    {
+        // Set the queue for profiling
+        queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
+    }
+    else
+    {
+        queue_profiler = default_queue;
+    }
+
     // Start intercepting enqueues:
     auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
                               const cl_event * event_wait_list, cl_event * event)
     {
-        ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
-        ARM_COMPUTE_UNUSED(event);
         if(this->kernel_event_is_set())
         {
             // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
@@ -139,49 +172,45 @@
         // Set OpenCL event
         this->set_cl_kernel_event(tmp);
 
+        if(event != nullptr)
+        {
+            // Return the cl_event from the intercepted call
+            clRetainEvent(tmp);
+            *event = tmp;
+        }
         return retval;
     };
     CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
 
     cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
 
+    cl::NDRange gws     = ICLKernel::gws_from_window(kernel.window());
     cl::NDRange opt_lws = cl::NullRange;
 
-    const int x_step = std::max(1, kernel.window().x().step());
-    const int y_step = std::max(1, kernel.window().y().step());
-    const int z_step = std::max(1, kernel.window().z().step());
-    const int x_end  = kernel.window().x().end() - kernel.window().x().start() / x_step > 1 ? 16 : 1;
-    const int y_end  = kernel.window().y().end() - kernel.window().y().start() / y_step > 1 ? 16 : 1;
-    const int z_end  = kernel.window().z().end() - kernel.window().z().start() / z_step > 1 ? 8 : 1;
+    const unsigned int lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 64u);
+    const unsigned int lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 32u);
+    const unsigned int lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 32u);
 
-    // First run using the default LWS
+    std::vector<unsigned int> lws_x;
+    std::vector<unsigned int> lws_y;
+    std::vector<unsigned int> lws_z;
+
+    // Initialize the LWS values to test
+    initialize_lws_values(lws_x, gws[0], lws_x_max, gws[2] > 16);
+    initialize_lws_values(lws_y, gws[1], lws_y_max, gws[2] > 16);
+    initialize_lws_values(lws_z, gws[2], lws_z_max, false);
+
+    for(const auto &z : lws_z)
     {
-        cl::NDRange lws_test = cl::NullRange;
-
-        kernel.set_lws_hint(lws_test);
-
-        // Run the kernel
-        kernel.run(kernel.window(), _queue_profiler);
-
-        _queue_profiler.finish();
-
-        const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
-        const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
-        const cl_ulong diff  = end - start;
-        _kernel_event        = nullptr;
-
-        min_exec_time = diff;
-    }
-
-    for(int z = 1; z <= z_end; ++z)
-    {
-        for(int y = 1; y <= y_end; ++y)
+        for(const auto &y : lws_y)
         {
-            for(int x = 1; x <= x_end; ++x)
+            for(const auto &x : lws_x)
             {
                 cl::NDRange lws_test = cl::NDRange(x, y, z);
 
-                const bool invalid_lws = (x * y * z > static_cast<int>(kernel.get_max_workgroup_size())) || (x == 1 && y == 1 && z == 1);
+                bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
+
+                invalid_lws = invalid_lws || (x > gws[0]) || (y > gws[1]) || (z > gws[2]);
 
                 if(invalid_lws)
                 {
@@ -192,9 +221,9 @@
                 kernel.set_lws_hint(lws_test);
 
                 // Run the kernel
-                kernel.run(kernel.window(), _queue_profiler);
+                kernel.run(kernel.window(), queue_profiler);
 
-                _queue_profiler.finish();
+                queue_profiler.finish();
 
                 const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
                 const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
@@ -278,3 +307,4 @@
     }
     fs.close();
 }
+} // namespace arm_compute
\ No newline at end of file
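The tuner now derives its candidate local work-group sizes from the kernel's GWS (via initialize_lws_values()) instead of sweeping a fixed 16x16x8 grid. Its public usage is unchanged; a minimal sketch of enabling tuning of new kernels:

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTuner.h"

    void init_with_tuner_sketch()
    {
        // Tune kernels that have no stored LWS yet (constructor argument shown in the diff above).
        static arm_compute::CLTuner tuner(true /* tune_new_kernels */);
        arm_compute::CLScheduler::get().default_init(&tuner);
    }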
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
new file mode 100644
index 0000000..a6393c5
--- /dev/null
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel>();
+    k->configure(input, output, axis, op);
+    _kernel = std::move(k);
+}
+
+Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+    return CLReductionOperationKernel::validate(input, output, axis, op);
+}
+} // namespace arm_compute
\ No newline at end of file
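A minimal configuration sketch for the new function; the tensor shape, the F32 input type and the U32 index output type are illustrative assumptions:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"

    void argmax_sketch()
    {
        using namespace arm_compute;
        CLTensor input, indices;
        input.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));
        indices.allocator()->init(TensorInfo(TensorShape(1U, 16U), 1, DataType::U32));

        CLArgMinMaxLayer argmax;
        // Reduce along axis 0, keeping the index of the maximum element.
        argmax.configure(&input, 0, &indices, ReductionOperation::ARG_IDX_MAX);
        // Tensors still need allocator()->allocate() before running the function.
    }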
diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp
deleted file mode 100644
index 0b05058..0000000
--- a/src/runtime/CL/functions/CLArithmeticAddition.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticAdditionKernel>();
-    k->configure(input1, input2, output, policy);
-    _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
-}
-
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
-    return CLArithmeticAdditionKernel::validate(input1, input2, output, policy);
-}
diff --git a/src/runtime/CL/functions/CLArithmeticDivision.cpp b/src/runtime/CL/functions/CLArithmeticDivision.cpp
deleted file mode 100644
index 1c2849c..0000000
--- a/src/runtime/CL/functions/CLArithmeticDivision.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticDivision.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticDivisionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticDivisionKernel>();
-    k->configure(input1, input2, output);
-    _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
-}
-
-Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
-    return CLArithmeticDivisionKernel::validate(input1, input2, output);
-}
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
deleted file mode 100644
index e661f6a..0000000
--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionKernel>();
-    k->configure(input1, input2, output, policy);
-    _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
-}
-
-Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
-    return CLArithmeticSubtractionKernel::validate(input1, input2, output, policy);
-}
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
new file mode 100644
index 0000000..e0ffcdb
--- /dev/null
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLCast.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
+    k->configure(input, output, policy, 0);
+    _kernel = std::move(k);
+}
+
+Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy)
+{
+    return CLDepthConvertLayerKernel::validate(input, output, policy, 0);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp
new file mode 100644
index 0000000..86c9c31
--- /dev/null
+++ b/src/runtime/CL/functions/CLComparison.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLComparison.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+    k->configure(input1, input2, output, operation);
+    _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
+}
+
+Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+{
+    return CLComparisonKernel::validate(input1, input2, output, operation);
+}
+
+template <ComparisonOperation COP>
+void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+    k->configure(input1, input2, output, COP);
+    _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
+}
+
+template <ComparisonOperation COP>
+Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLComparisonKernel::validate(input1, input2, output, COP);
+}
+
+// Supported Specializations
+template class CLComparisonStatic<ComparisonOperation::Equal>;
+template class CLComparisonStatic<ComparisonOperation::NotEqual>;
+template class CLComparisonStatic<ComparisonOperation::Greater>;
+template class CLComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class CLComparisonStatic<ComparisonOperation::Less>;
+template class CLComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace arm_compute
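A minimal sketch of the new comparison function; the shape and the U8 mask output type are illustrative assumptions:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLComparison.h"

    void comparison_sketch()
    {
        using namespace arm_compute;
        const TensorShape shape(64U, 32U);

        CLTensor a, b, mask;
        a.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        b.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        mask.allocator()->init(TensorInfo(shape, 1, DataType::U8));

        CLComparison greater;
        // Element-wise a > b, producing a U8 mask.
        greater.configure(&a, &b, &mask, ComparisonOperation::Greater);
    }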
diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
index 409d3c9..24c152f 100644
--- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp
+++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index e07feb2..9da02c1 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -158,6 +158,18 @@
     _scaled_output.allocator()->allocate();
 }
 
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+                                     const WeightsInfo &weights_info)
+{
+    configure(input, weights, bias, output, info, 0, 0, weights_info);
+}
+
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+                                      const WeightsInfo &weights_info)
+{
+    return CLDeconvolutionLayer::validate(input, weights, bias, output, info, 0, 0, weights_info);
+}
+
 void CLDeconvolutionLayer::run()
 {
     prepare();
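The new configure()/validate() overloads simply forward to the existing ones with a zero inner border. A hypothetical validate-style helper, assuming the tensor infos are set up elsewhere:

    #include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"

    arm_compute::Status check_deconv_sketch(const arm_compute::ITensorInfo *input,
                                            const arm_compute::ITensorInfo *weights,
                                            const arm_compute::ITensorInfo *bias,
                                            arm_compute::ITensorInfo       *output,
                                            const arm_compute::PadStrideInfo &info)
    {
        // Uses the new overload; equivalent to passing inner_border_right/top = 0.
        return arm_compute::CLDeconvolutionLayer::validate(input, weights, bias, output, info, arm_compute::WeightsInfo());
    }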
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
index b5e8fd9..e46647a 100644
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,7 +66,7 @@
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
         _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
-        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
 
         depth_offset += inputs_vector.at(i)->info()->dimension(2);
     }
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index 2e52e8a..dbf71ac 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -28,8 +28,8 @@
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
@@ -41,3 +41,4 @@
 {
     return CLDepthConvertLayerKernel::validate(input, output, policy, shift);
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 497cdae..15cbfce 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,18 +26,21 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
 #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc;
 using namespace arm_compute::misc::shape_calculator;
 
-CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
-    : _kernel(nullptr), _border_handler()
+CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _kernel(nullptr), _border_handler(), _permute_input_to_nchw(), _permute_weights_to_nchw(), _permute_output_to_nhwc(), _reshape_weights(), _permuted_input(),
+      _permuted_weights(), _permuted_output(), _original_weights(nullptr), _needs_permute(false), _needs_weights_reshape(false), _is_prepared(false)
 {
 }
 
@@ -47,25 +50,79 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
-    if(input->info()->data_layout() == DataLayout::NCHW)
+    const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
+
+    _needs_permute         = is_nhwc && (depth_multiplier > 1);
+    _needs_weights_reshape = is_nhwc && (depth_multiplier == 1)
+                             && is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_prepared      = false;
+    _original_weights = weights;
+
+    ICLTensor       *input_to_use   = input;
+    const ICLTensor *weights_to_use = weights;
+    ICLTensor       *output_to_use  = output;
+
+    const bool                      is_stride_1       = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool                      is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+    DepthwiseConvolutionReshapeInfo info;
+    info.c0        = 4;
+    info.transpose = is_stride_1 && is_dot8_supported;
+
+    if(_needs_permute)
     {
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
+
+        // Configure the function to transform the input tensor from NHWC -> NCHW
+        _permute_input_to_nchw.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+        // Configure the function to transform the weights tensor from HWI -> IHW
+        _permute_weights_to_nchw.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+        input_to_use   = &_permuted_input;
+        weights_to_use = &_permuted_weights;
+        output_to_use  = &_permuted_output;
+
         _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
     }
+    else if(is_nhwc)
+    {
+        if(_needs_weights_reshape)
+        {
+            _reshape_weights.configure(weights, &_permuted_weights, info);
+            weights_to_use = &_permuted_weights;
+        }
+        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+    }
     else
     {
-        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
     }
 
+    // Configure kernel
     _kernel->set_target(CLScheduler::get().target());
-    _kernel->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+    _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info);
 
+    // Permute output if needed
+    if(_needs_permute)
+    {
+        // Configure the function to transform the convolved output back to the NHWC data layout
+        _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+        _permute_output_to_nhwc.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+        // Allocate tensors
+        _permuted_input.allocator()->allocate();
+        _permuted_output.allocator()->allocate();
+    }
     // Configure border handler
     PixelValue &&zero_value(0.f);
     if(is_data_type_quantized_asymmetric(input->info()->data_type()))
     {
         zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
     }
-    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
+    _border_handler.configure(input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
 }
 
 Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -75,23 +132,99 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
 
-    if(input->data_layout() == DataLayout::NCHW)
+    const bool                      is_nhwc               = input->data_layout() == DataLayout::NHWC;
+    const bool                      needs_permute         = is_nhwc && (depth_multiplier > 1);
+    const bool                      needs_weights_reshape = is_nhwc && (depth_multiplier == 1);
+    const bool                      is_stride_1           = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool                      is_dot8_supported     = dot8_supported(CLKernelLibrary::get().get_device());
+    DepthwiseConvolutionReshapeInfo info;
+    info.c0        = 4;
+    info.transpose = is_stride_1 && is_dot8_supported;
+
+    if(needs_permute)
     {
-        return CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target);
+        TensorShape permuted_input_shape   = input->tensor_shape();
+        TensorShape permuted_weights_shape = weights->tensor_shape();
+        TensorShape permuted_output_shape  = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+
+        permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
+        permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
+        permute(permuted_output_shape, PermutationVector(1U, 2U, 0U));
+
+        const TensorInfo permuted_input   = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW);
+        const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
+        const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target));
+    }
+    else if(is_nhwc)
+    {
+        if(needs_weights_reshape)
+        {
+            auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
+                                                                                           act_info));
+        }
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target));
     }
 
-    return CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+    return Status{};
 }
 
 void CLDepthwiseConvolutionLayer3x3::run()
 {
+    prepare();
+
+    _memory_group.acquire();
+
+    if(_needs_permute)
+    {
+        _permute_input_to_nchw.run();
+    }
     CLScheduler::get().enqueue(_border_handler);
     CLScheduler::get().enqueue(*_kernel);
+
+    if(_needs_permute)
+    {
+        _permute_output_to_nhwc.run();
+    }
+
+    _memory_group.release();
+}
+
+void CLDepthwiseConvolutionLayer3x3::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_needs_permute)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+            _permuted_weights.allocator()->allocate();
+            _permute_weights_to_nchw.run();
+            _original_weights->mark_as_unused();
+        }
+
+        if(_needs_weights_reshape)
+        {
+            ARM_COMPUTE_ERROR_ON(_needs_permute);
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+            _permuted_weights.allocator()->allocate();
+            CLScheduler::get().enqueue(_reshape_weights);
+            _original_weights->mark_as_unused();
+        }
+        _is_prepared = true;
+    }
 }
 
 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _activationlayer_function(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(),
-      _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
+      _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr),
+      _optimised_function(nullptr)
 {
 }
 
@@ -104,98 +237,110 @@
 
     const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
-    const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
 
-    const size_t weights_w = weights->info()->dimension(idx_w);
-    const size_t weights_h = weights->info()->dimension(idx_h);
-    const size_t weights_z = weights->info()->dimension(idx_c);
+    const bool can_run_optimised_3x3_kernel = (weights->info()->dimension(idx_w) == 3) && (weights->info()->dimension(idx_h) == 3);
 
-    _is_prepared      = false;
-    _original_weights = weights;
-    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
-
-    bool            append_bias = (biases != nullptr) && !_is_quantized;
-    const GPUTarget gpu_target  = CLScheduler::get().target();
-
-    // Calculate output shape
-    TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
-    // Output width and height
-    const unsigned int conv_w = output_shape[idx_w];
-    const unsigned int conv_h = output_shape[idx_h];
-
-    // Set up intermediate tensors
-    const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
-    const size_t conv_size  = conv_w * conv_h;
-
-    // Im2Col configuration
-    TensorShape shape_im2col = input->info()->tensor_shape();
-    shape_im2col.set(0, patch_size);
-    shape_im2col.set(1, conv_size);
-    shape_im2col.set(2, weights_z);
-    _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-    _im2col_kernel.set_target(gpu_target);
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
-    CLScheduler::get().tune_kernel_static(_im2col_kernel);
-
-    // Weights reshape configuration
-    const TensorShape shape_weights_reshape(patch_size, weights_z);
-    _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
-    _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
-
-    // GEMV configuration
-    DataType    v2mm_dt        = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
-    TensorShape shape_v2mm_out = input->info()->tensor_shape();
-    shape_v2mm_out.set(0, conv_size * weights_z);
-    shape_v2mm_out.set(1, 1);
-    shape_v2mm_out.set(2, 1);
-    _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
-    _v2mm_kernel.set_target(gpu_target);
-    _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
-    CLScheduler::get().tune_kernel_static(_v2mm_kernel);
-    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
-    _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
-
-    // Output staged configuration
-    if(_is_quantized)
+    if(bool(can_run_optimised_3x3_kernel))
     {
-        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
-
-        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-        int   output_multiplier, output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
-        _output_reshaped.allocator()->allocate();
+        auto f = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3>();
+        f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+        _optimised_function = std::move(f);
     }
-
-    // Fill borders on inputs
-    PixelValue zero_in(static_cast<int32_t>(0));
-    PixelValue zero_w(static_cast<int32_t>(0));
-    if(_is_quantized)
+    else
     {
-        zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
-        zero_w  = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
-    }
-    BorderSize border_size = _v2mm_kernel.border_size();
-    _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
+        const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
 
-    border_size.bottom = 0;
-    _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
+        const size_t weights_w = weights->info()->dimension(idx_w);
+        const size_t weights_h = weights->info()->dimension(idx_h);
+        const size_t weights_z = weights->info()->dimension(idx_c);
 
-    // Allocate intermediate tensors
-    _input_reshaped.allocator()->allocate();
-    _v2mm_output.allocator()->allocate();
+        _is_prepared      = false;
+        _original_weights = weights;
+        _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
-    //Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
+        bool            append_bias = (biases != nullptr) && !_is_quantized;
+        const GPUTarget gpu_target  = CLScheduler::get().target();
 
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
+        // Calculate output shape
+        TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+        // Output auto initialization if not yet initialized
+        auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+        // Output width and height
+        const unsigned int conv_w = output_shape[idx_w];
+        const unsigned int conv_h = output_shape[idx_h];
+
+        // Set up intermediate tensors
+        const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
+        const size_t conv_size  = conv_w * conv_h;
+
+        // Im2Col configuration
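+        // The reshaped input has one im2col patch per row: x = patch elements (weights_w * weights_h, +1 if the bias is appended), y = output pixels (conv_w * conv_h), z = channels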
+        TensorShape shape_im2col = input->info()->tensor_shape();
+        shape_im2col.set(0, patch_size);
+        shape_im2col.set(1, conv_size);
+        shape_im2col.set(2, weights_z);
+        _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+        _im2col_kernel.set_target(gpu_target);
+        _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+        CLScheduler::get().tune_kernel_static(_im2col_kernel);
+
+        // Weights reshape configuration
+        const TensorShape shape_weights_reshape(patch_size, weights_z);
+        _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+        _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
+
+        // GEMV configuration
+        DataType    v2mm_dt        = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
+        TensorShape shape_v2mm_out = input->info()->tensor_shape();
+        shape_v2mm_out.set(0, conv_size * weights_z);
+        shape_v2mm_out.set(1, 1);
+        shape_v2mm_out.set(2, 1);
+        _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+        _v2mm_kernel.set_target(gpu_target);
+        _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+        CLScheduler::get().tune_kernel_static(_v2mm_kernel);
+        _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+        _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
+
+        // Output staged configuration
+        if(_is_quantized)
+        {
+            const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
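+            // The float rescale factor input_scale * weights_scale / output_scale (expected to be < 1) is re-expressed as a fixed-point multiplier plus a right shift for the output stage kernel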
+            float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+            int   output_multiplier, output_shift;
+            quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+            _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
+            _output_reshaped.allocator()->allocate();
+        }
+
+        // Fill borders on inputs
+        PixelValue zero_in(static_cast<int32_t>(0));
+        PixelValue zero_w(static_cast<int32_t>(0));
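+        // For quantized inputs the constant border value must be the zero-point offset, not a literal 0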
+        if(_is_quantized)
+        {
+            zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
+            zero_w  = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
+        }
+        BorderSize border_size = _v2mm_kernel.border_size();
+        _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
+
+        border_size.bottom = 0;
+        _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
+
+        // Allocate intermediate tensors
+        _input_reshaped.allocator()->allocate();
+        _v2mm_output.allocator()->allocate();
+
+        //Configure Activation Layer
+        _is_activationlayer_enabled = act_info.enabled();
+
+        if(_is_activationlayer_enabled)
+        {
+            _activationlayer_function.configure(output, nullptr, act_info);
+        }
     }
 }
 
@@ -204,55 +349,64 @@
 {
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-    const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
+    const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
 
-    const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
-    const bool         append_bias  = (biases != nullptr) && !is_quantized;
-    const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
-    const size_t       weights_w    = weights->dimension(idx_w);
-    const size_t       weights_h    = weights->dimension(idx_h);
-    const size_t       weights_z    = weights->dimension(idx_c);
-    const unsigned int conv_w       = output_shape[idx_w];
-    const unsigned int conv_h       = output_shape[idx_h];
-    const size_t       patch_size   = weights_w * weights_h + ((append_bias) ? 1 : 0);
-    const size_t       conv_size    = conv_w * conv_h;
-
-    TensorShape shape_im2col = input->tensor_shape();
-    shape_im2col.set(0, patch_size);
-    shape_im2col.set(1, conv_size);
-    shape_im2col.set(2, weights_z);
-    TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
-
-    const TensorShape shape_weights_reshape(patch_size, weights_z);
-    TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseWeightsReshapeKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
-
-    DataType    v2mm_dt        = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
-    TensorShape shape_v2mm_out = input->tensor_shape();
-    shape_v2mm_out.set(0, conv_size * weights_z);
-    shape_v2mm_out.set(1, 1);
-    shape_v2mm_out.set(2, 1);
-    TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
-
-    TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
-
-    if(is_quantized)
+    if(!can_run_optimised_3x3_kernel)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
-    }
+        const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
 
-    // Validate Activation Layer
-    if(act_info.enabled())
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+        ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
+
+        const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+        const bool         append_bias  = (biases != nullptr) && !is_quantized;
+        const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        const size_t       weights_w    = weights->dimension(idx_w);
+        const size_t       weights_h    = weights->dimension(idx_h);
+        const size_t       weights_z    = weights->dimension(idx_c);
+        const unsigned int conv_w       = output_shape[idx_w];
+        const unsigned int conv_h       = output_shape[idx_h];
+        const size_t       patch_size   = weights_w * weights_h + ((append_bias) ? 1 : 0);
+        const size_t       conv_size    = conv_w * conv_h;
+
+        TensorShape shape_im2col = input->tensor_shape();
+        shape_im2col.set(0, patch_size);
+        shape_im2col.set(1, conv_size);
+        shape_im2col.set(2, weights_z);
+        TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+
+        const TensorShape shape_weights_reshape(patch_size, weights_z);
+        TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
+
+        DataType    v2mm_dt        = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+        TensorShape shape_v2mm_out = input->tensor_shape();
+        shape_v2mm_out.set(0, conv_size * weights_z);
+        shape_v2mm_out.set(1, 1);
+        shape_v2mm_out.set(2, 1);
+        TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
+
+        TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
+
+        if(is_quantized)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
+        }
+
+        // Validate Activation Layer
+        if(act_info.enabled())
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+        }
+    }
+    else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
     }
-
     return Status{};
 }
 
@@ -260,33 +414,48 @@
 {
     prepare();
 
-    CLScheduler::get().enqueue(_im2col_kernel);
-    CLScheduler::get().enqueue(_v2mm_input_fill_border);
-    CLScheduler::get().enqueue(_v2mm_kernel);
-    CLScheduler::get().enqueue(_vector_to_tensor_kernel);
-    if(_is_quantized)
+    if(_optimised_function != nullptr)
     {
-        CLScheduler::get().enqueue(_output_stage_kernel);
+        _optimised_function->run();
     }
-    if(_is_activationlayer_enabled)
+    else
     {
-        _activationlayer_function.run();
+        CLScheduler::get().enqueue(_im2col_kernel);
+        CLScheduler::get().enqueue(_v2mm_input_fill_border);
+        CLScheduler::get().enqueue(_v2mm_kernel);
+        CLScheduler::get().enqueue(_vector_to_tensor_kernel);
+        if(_is_quantized)
+        {
+            CLScheduler::get().enqueue(_output_stage_kernel);
+        }
+        if(_is_activationlayer_enabled)
+        {
+            _activationlayer_function.run();
+        }
     }
 }
 
 void CLDepthwiseConvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if(_optimised_function != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        _optimised_function->prepare();
+    }
+    else
+    {
+        if(!_is_prepared)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
-        // Run weights reshaping and mark original weights tensor as unused
-        _weights_reshaped.allocator()->allocate();
-        CLScheduler::get().enqueue(_weights_reshape_kernel);
-        CLScheduler::get().enqueue(_v2mm_weights_fill_border);
-        _original_weights->mark_as_unused();
+            // Run weights reshaping and mark original weights tensor as unused
+            _weights_reshaped.allocator()->allocate();
+            CLScheduler::get().enqueue(_weights_reshape_kernel);
+            CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+            _original_weights->mark_as_unused();
 
-        CLScheduler::get().queue().finish();
-        _is_prepared = true;
+            CLScheduler::get().queue().finish();
+            _is_prepared = true;
+        }
     }
 }
+} // namespace arm_compute
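For context, a minimal usage sketch of the updated function (illustrative only, not part of the patch; names, shapes and the default-initialised CL runtime are assumptions): with 3x3 weights the optimised CLDepthwiseConvolutionLayer3x3 path is selected internally, otherwise the generic im2col/GEMV path shown above is used.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

using namespace arm_compute;

void depthwise_example()
{
    CLScheduler::get().default_init(); // set up the OpenCL context and queue

    // NCHW tensors: 32x32 input with 8 channels, 3x3 depthwise weights -> optimised path
    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));

    CLDepthwiseConvolutionLayer depthwise;
    depthwise.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1), 1 /* depth_multiplier */);

    for(auto *t : { &src, &weights, &biases, &dst })
    {
        t->allocator()->allocate();
    }

    depthwise.run(); // run() calls prepare() internally
}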
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
new file mode 100644
index 0000000..b7e9a68
--- /dev/null
+++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void CLRsqrtLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(input, output, ElementWiseUnary::RSQRT);
+    _kernel = std::move(k);
+}
+Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::RSQRT);
+}
+
+void CLExpLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(input, output, ElementWiseUnary::EXP);
+    _kernel = std::move(k);
+}
+Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::EXP);
+}
+} // namespace arm_compute
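A minimal usage sketch for the new unary functions (illustrative only; tensor shapes are arbitrary and the CL runtime is assumed to be initialised as in the previous sketch):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"

using namespace arm_compute;

void unary_example()
{
    // Assumes CLScheduler::get().default_init() has already been called
    CLTensor in, out;
    in.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));

    CLRsqrtLayer rsqrt;
    rsqrt.configure(&in, &out); // element-wise 1/sqrt(x); CLExpLayer is configured the same way

    in.allocator()->allocate();
    out.allocator()->allocate();
    rsqrt.run();
}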
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
new file mode 100644
index 0000000..28f4b13
--- /dev/null
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "support/ToolchainSupport.h"
+#include <arm_compute/runtime/CL/functions/CLElementwiseOperations.h>
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace
+{
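+// Replicates the border of the input that is broadcast along x (dimension 0 == 1) so the element-wise kernel can safely read full vectors at the edge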
+void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
+{
+    if(output->info()->dimension(0) > 1)
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            border_handler.configure(broadcasted_info, border_size, BorderMode::REPLICATE);
+        }
+    }
+}
+} // namespace
+
+void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::ADD, input1, input2, output, policy);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, input1, input2, output, policy);
+}
+
+void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::SUB, input1, input2, output, policy);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_UNUSED(policy);
+    return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, input1, input2, output, policy);
+}
+
+void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::DIV, input1, input2, output);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLArithmeticOperationKernel::validate(ArithmeticOperation::DIV, input1, input2, output);
+}
+
+void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::MAX, input1, input2, output);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
+}
+
+void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::MIN, input1, input2, output);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
+}
+
+void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+}
+} // namespace arm_compute
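Similarly, a sketch for the new binary element-wise functions (illustrative only; an input of width 1 would be broadcast along x via the border handler configured above):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

using namespace arm_compute;

void elementwise_example()
{
    // Assumes CLScheduler::get().default_init() has already been called
    CLTensor a, b, sum;
    a.allocator()->init(TensorInfo(TensorShape(64U, 4U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(64U, 4U), 1, DataType::F32));
    sum.allocator()->init(TensorInfo(TensorShape(64U, 4U), 1, DataType::F32));

    CLArithmeticAddition add;
    add.configure(&a, &b, &sum, ConvertPolicy::SATURATE);

    a.allocator()->allocate();
    b.allocator()->allocate();
    sum.allocator()->allocate();
    add.run();
}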
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index baa0cf4..e91038f 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,32 +33,42 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
 
 namespace
 {
-inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_interleaved_transposed(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
 {
     bool flag = true;
 
     if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
     {
-        // COMPMID-852
-        if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
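+        // Reshape whenever m > 1 and n is small (< 16); otherwise fall back to the COMPMID-852 heuristic below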
+        if((m > 1) && (n < 16))
         {
-            constexpr float alpha = 3.2f;
-            constexpr float fact0 = 1.51f;
-            constexpr float fact1 = 1.66f;
-            constexpr float ops   = 12.0f;
-            const float     scale = k > 1024 ? 1.07f : 1.0f;
-            flag                  = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
+            flag = true;
         }
         else
         {
-            flag = false;
+            // COMPMID-852
+            if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+            {
+                constexpr float alpha = 3.2f;
+                constexpr float fact0 = 1.51f;
+                constexpr float fact1 = 1.66f;
+                constexpr float ops   = 12.0f;
+                const float     scale = k > 1024 ? 1.07f : 1.0f;
+                flag                  = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
+            }
+            else
+            {
+                flag = false;
+            }
         }
     }
     else
@@ -73,17 +83,19 @@
 
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
-      _interleave_kernel(),
-      _transpose_kernel(),
       _mm_kernel(),
       _ma_kernel(),
+      _reshape_lhs_kernel(),
+      _reshape_rhs_kernel(),
+      _mm_reshaped_kernel(),
       _tmp_a(),
       _tmp_b(),
       _original_b(nullptr),
       _is_interleaved_transposed(false),
       _run_addition(false),
       _reshape_b_only_on_first_run(false),
-      _is_prepared(false)
+      _is_prepared(false),
+      _is_new_gemm_reshaped(false)
 {
 }
 
@@ -106,29 +118,52 @@
     const GPUTarget gpu_target = CLScheduler::get().target();
 
     // Set the target for the kernels
-    _interleave_kernel.set_target(gpu_target);
+    _reshape_lhs_kernel.set_target(gpu_target);
     _mm_kernel.set_target(gpu_target);
 
     // Arguments used by GEMMReshapeInfo
     // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
     // in order to know how the matrices have been reshaped
-    bool      reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const int m                         = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
-    const int n                         = b->info()->dimension(0);
-    const int k                         = a->info()->dimension(0);
-    const int depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
-    int       mult_transpose1xW_width   = 1;
-    int       mult_interleave4x4_height = 1;
+    DataType           data_type                 = a->info()->data_type();
+    bool               reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                         = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                         = b->info()->dimension(0);
+    const unsigned int k                         = a->info()->dimension(0);
+    const unsigned int batch_size                = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+    const int          depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    int                mult_transpose1xW_width   = 1;
+    int                mult_interleave4x4_height = 1;
 
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
         mult_transpose1xW_width   = 4;
         mult_interleave4x4_height = 2;
     }
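+    // Default LHS/RHS reshape descriptors equivalent to the legacy interleave4x4 / transpose1xW layouts; the reshaped-GEMM path below picks its own configuration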
+    GEMMRHSMatrixInfo rhs_info;
+    rhs_info.n0         = 16 / b->info()->element_size();
+    rhs_info.k0         = 1;
+    rhs_info.h0         = mult_transpose1xW_width;
+    rhs_info.interleave = false;
+    rhs_info.transpose  = false;
+
+    GEMMLHSMatrixInfo lhs_info;
+    lhs_info.m0         = 4;
+    lhs_info.k0         = 4;
+    lhs_info.v0         = mult_interleave4x4_height;
+    lhs_info.interleave = true;
+    lhs_info.transpose  = true;
 
     // Check if we need to reshape the matrix A and matrix B
     _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
 
+    // Check if we can run the new reshaped GEMM
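+    // Heuristic: use the new reshaped kernels only for sufficiently large F32 GEMMs (m * n / 20 > 1600) on Bifrost, when reshaping is enabled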
+    const auto workload   = static_cast<float>((m * n) / 20.0f);
+    _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32);
+
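+    // A 1-D matrix C with beta == 1 can be fused into the matrix multiply kernel; otherwise the separate matrix addition kernel is used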
+    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
+    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
+    const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped;
+
     // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
     if(_is_interleaved_transposed)
     {
@@ -145,19 +180,37 @@
         }
         // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
 
-        // Configure interleave kernel
-        _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d());
+        if(_is_new_gemm_reshaped)
+        {
+            GEMMLHSMatrixInfo lhs_info;
 
-        // Configure transpose kernel
-        _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
+            // Pick up the GEMM configuration
+            std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
+
+            _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+            _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
+                                                                                                                 depth_output_gemm3d, reinterpret_input_as_3d));
+        }
+        else
+        {
+            // Configure interleave kernel
+            _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+            // Configure transpose kernel
+            _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+        }
     }
 
-    // Configure and tune matrix multiply kernel
-    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
-                                                                                                        mult_transpose1xW_width, mult_interleave4x4_height,
-                                                                                                        depth_output_gemm3d, reinterpret_input_as_3d),
-                         gemm_info.fp_mixed_precision());
-    CLScheduler::get().tune_kernel_static(_mm_kernel);
+    if(!_is_new_gemm_reshaped)
+    {
+        // Configure and tune matrix multiply kernel
+        _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed,
+                             GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d),
+                             gemm_info.fp_mixed_precision());
+        CLScheduler::get().tune_kernel_static(_mm_kernel);
+    }
 
     if(_is_interleaved_transposed)
     {
@@ -170,7 +223,7 @@
     }
 
     // Configure matrix addition kernel
-    if(beta != 0 && c != nullptr)
+    if(add_matrix_c && !use_fused_add)
     {
         _ma_kernel.configure(c, output, beta);
         _run_addition = true;
@@ -197,13 +250,15 @@
     // Arguments used by GEMMReshapeInfo
     // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
     // in order to know how the matrices have been reshaped
-    bool      reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const int m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const int n                         = b->dimension(0);
-    const int k                         = a->dimension(0);
-    int       mult_transpose1xW_width   = 1;
-    int       mult_interleave4x4_height = 1;
-    const int depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    DataType           data_type                 = a->data_type();
+    bool               reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                         = b->dimension(0);
+    const unsigned int k                         = a->dimension(0);
+    const unsigned int batch_size                = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    int                mult_transpose1xW_width   = 1;
+    int                mult_interleave4x4_height = 1;
+    const int          depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
 
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
@@ -211,9 +266,31 @@
         mult_interleave4x4_height = 2;
     }
 
+    GEMMRHSMatrixInfo rhs_info;
+    rhs_info.n0         = 16 / b->element_size();
+    rhs_info.k0         = 1;
+    rhs_info.h0         = mult_transpose1xW_width;
+    rhs_info.interleave = false;
+    rhs_info.transpose  = false;
+
+    GEMMLHSMatrixInfo lhs_info;
+    lhs_info.m0         = 4;
+    lhs_info.k0         = 4;
+    lhs_info.v0         = mult_interleave4x4_height;
+    lhs_info.interleave = true;
+    lhs_info.transpose  = true;
+
     // Check if we need to reshape the matrix A and matrix B
     const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
 
+    // Check if we can run the new reshaped GEMM
+    const auto workload             = static_cast<float>((m * n) / 20.0f);
+    const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32);
+
+    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
+    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
+    const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped;
+
     // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
     if(run_interleave_transpose)
     {
@@ -227,19 +304,42 @@
         matrix_a_info = &tmp_a_info;
         matrix_b_info = &tmp_b_info;
 
-        // Validate interleave kernel
-        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
+        if(is_new_gemm_reshaped)
+        {
+            GEMMLHSMatrixInfo lhs_info;
 
-        // Validate transpose kernel
-        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+            // Pick up the GEMM configuration
+            std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
+
+            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
+                                                                                     depth_output_gemm3d, reinterpret_input_as_3d)));
+        }
+        else
+        {
+            // Validate interleave kernel
+            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+            // Validate transpose kernel
+            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+        }
     }
 
-    // Validate matrix multiply
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+    if(!is_new_gemm_reshaped)
+    {
+        // Validate matrix multiply
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta,
+                                                                         run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+    }
 
-    if(beta != 0 && c != nullptr)
+    if(add_matrix_c && !use_fused_add)
     {
         // Validate matrix addition kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
@@ -257,17 +357,24 @@
     if(_is_interleaved_transposed)
     {
         // Run interleave kernel
-        CLScheduler::get().enqueue(_interleave_kernel, false);
+        CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
 
         if(!_reshape_b_only_on_first_run)
         {
             // Run transpose kernel
-            CLScheduler::get().enqueue(_transpose_kernel, false);
+            CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
         }
     }
 
     // Run matrix multiply kernel
-    CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+    if(_is_new_gemm_reshaped)
+    {
+        CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
+    }
+    else
+    {
+        CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+    }
 
     // Run matrix addition kernel
     if(_run_addition)
@@ -286,10 +393,11 @@
         {
             // Run transpose kernel and mark original weights tensor as unused
             _tmp_b.allocator()->allocate();
-            CLScheduler::get().enqueue(_transpose_kernel, false);
+            CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
             _original_b->mark_as_unused();
         }
         CLScheduler::get().queue().finish();
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
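A sketch of the new fused-beta behaviour (illustrative only, not part of the patch): when C is a 1-D bias and beta == 1, CLGEMM hands the bias straight to the matrix multiply kernel instead of running the separate matrix addition kernel.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"

using namespace arm_compute;

void gemm_fused_bias_example()
{
    // Assumes CLScheduler::get().default_init() has already been called
    CLTensor a, b, bias, dst;
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));   // A is M=32 x K=64 (shape is width=K, height=M)
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));   // B is K=64 x N=16
    bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));     // 1-D bias of length N=16
    dst.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32)); // output is M=32 x N=16

    CLGEMM gemm;
    gemm.configure(&a, &b, &bias, &dst, 1.0f /* alpha */, 1.0f /* beta == 1 -> fused bias */);

    for(auto *t : { &a, &b, &bias, &dst })
    {
        t->allocator()->allocate();
    }
    gemm.run();
}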
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 4694aa7..7105e85 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -93,7 +93,7 @@
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
       _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false), _skip_col2im(false), _is_quantized(false),
-      _is_activationlayer_enabled(false), _is_prepared(false)
+      _is_activationlayer_enabled(false), _is_prepared(false), _run_addition(true)
 {
 }
 
@@ -101,7 +101,8 @@
                                           int gemm_3d_depth)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col,
+                                           _run_addition));
 
     const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
                                          gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
@@ -125,13 +126,15 @@
     }
     else
     {
+        // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run
+        const bool skip_bias_in_gemm = _run_addition || !_skip_im2col;
         // Configure matrix multiply function
-        _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+        _mm_gemm.configure(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
     }
 }
 
 Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                           const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col)
+                                           const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, bool run_addition)
 {
     const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
 
@@ -156,8 +159,10 @@
     }
     else
     {
+        // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run
+        const bool skip_bias_in_gemm = run_addition || !skip_im2col;
         // Perform validation step on Matrix multiply function
-        return CLGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+        return CLGEMM::validate(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
     }
 }
 
@@ -193,6 +198,8 @@
     _skip_col2im                = data_layout == DataLayout::NHWC;
     _append_bias                = (biases != nullptr) && (!_is_quantized);
     _is_activationlayer_enabled = act_info.enabled();
+    // For F16 the bias is fused into the GEMM, so the separate addition kernel is not needed
+    _run_addition = (_skip_im2col) && (_append_bias) && (data_type != DataType::F16);
 
     // Set the GPU target for im2col and col2im
     _im2col_kernel.set_target(CLScheduler::get().target());
@@ -242,7 +249,7 @@
     else if(_append_bias)
     {
         // Configure add bias kernel
-        _add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
+        _add_bias_kernel.configure(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE);
     }
 
     // Create GEMM output tensor
@@ -276,9 +283,9 @@
     {
         const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
 
-        const float multiplier  = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
-        int   output_multiplier = 0;
-        int   output_shift      = 0;
+        const float multiplier        = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
+        int         output_multiplier = 0;
+        int         output_shift      = 0;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
 
         int min_activation = 0;
@@ -375,6 +382,8 @@
     const bool skip_im2col                = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
     const bool skip_col2im                = data_layout == DataLayout::NHWC;
     bool       is_activationlayer_enabled = act_info.enabled();
+    // For F16 the bias is fused into the GEMM, so the separate addition kernel is not needed
+    const bool run_addition = (skip_im2col) && (append_bias) && (data_type != DataType::F16);
 
     ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -429,10 +438,10 @@
         ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation, num_groups));
         gemm_input_to_use = &im2col_reshaped_info;
     }
-    else if(append_bias)
+    else if(run_addition)
     {
         // Validate add bias kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE));
     }
 
     // Create GEMM output tensor
@@ -459,9 +468,9 @@
     {
         const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input->quantization_info() : output->quantization_info();
 
-        const float multiplier  = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
-        int   output_multiplier = 0;
-        int   output_shift      = 0;
+        const float multiplier        = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
+        int         output_multiplier = 0;
+        int         output_shift      = 0;
 
         ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
 
@@ -496,7 +505,7 @@
     // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, run_addition));
 
     // Validate Col2Im
     if(!skip_col2im)
@@ -537,7 +546,7 @@
         _mm_gemm.run();
     }
 
-    if(_skip_im2col && _append_bias)
+    if(_run_addition)
     {
         CLScheduler::get().enqueue(_add_bias_kernel);
     }
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 2d4d231..2a01db7 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,42 +31,25 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
 
 namespace
 {
-inline bool is_interleaved_transposed(int m, int n, int k, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_gemm_reshaped(unsigned int m, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
 {
-    bool flag = true;
-
-    if(gpu_target_is_in(gpu_target,
-                        GPUTarget::G71, GPUTarget::G72,
-                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT))
-    {
-        // COMPMID-852
-        if(k > 256 && m > 4 && reshape_b_only_on_first_run)
-        {
-            flag = ((0.72f + n * 0.10766f) < (n * 0.1284f));
-        }
-        else
-        {
-            flag = false;
-        }
-    }
-    else
-    {
-        flag = m > 1;
-    }
-
-    return flag;
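+    // Use the reshaped LHS/RHS kernels on non-Midgard targets when m > 1 and B is reshaped only on the first run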
+    return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (m > 1) && (reshape_b_only_on_first_run);
 }
 } // namespace
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
       _mm_kernel(),
+      _mm_reshaped_kernel(),
       _mtx_a_reshape_kernel(),
       _mtx_b_reshape_kernel(),
       _mtx_a_reduction_kernel(),
@@ -81,7 +64,7 @@
       _original_b(nullptr),
       _a_offset(0),
       _b_offset(0),
-      _is_interleaved_transposed(true),
+      _is_gemm_reshaped(true),
       _reshape_b_only_on_first_run(false),
       _is_prepared(false),
       _fuse_output_stage(false)
@@ -108,23 +91,23 @@
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
+    GEMMRHSMatrixInfo rhs_info;
+    GEMMLHSMatrixInfo lhs_info;
 
     // Arguments used by GEMMReshapeInfo
     // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
     // in order to know how the matrices have been reshaped
-    bool          reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const bool    unroll_block              = dot8_supported(CLKernelLibrary::get().get_device());
-    const int     m                         = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
-    const int     n                         = b->info()->dimension(0);
-    const int     k                         = a->info()->dimension(0);
-    const int     depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
-    constexpr int mult_transpose1xW_width   = 1;
-    constexpr int mult_interleave4x4_height = 1;
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                       = b->info()->dimension(0);
+    const unsigned int k                       = a->info()->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
 
     // Check if we need to reshape the matrix A and matrix B
-    _is_interleaved_transposed = is_interleaved_transposed(m, n, k, _reshape_b_only_on_first_run, gpu_target);
+    _is_gemm_reshaped = is_gemm_reshaped(m, _reshape_b_only_on_first_run, gpu_target);
 
-    if(_is_interleaved_transposed)
+    if(_is_gemm_reshaped)
     {
         // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
         reinterpret_input_as_3d = false;
@@ -138,11 +121,14 @@
             _memory_group.manage(&_tmp_b);
         }
 
+        // Pick up the GEMM configuration
+        std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+
         // Configure interleave kernel
-        _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d(), unroll_block);
+        _mtx_a_reshape_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
 
         // Configure transpose kernel
-        _mtx_b_reshape_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
+        _mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info);
     }
 
     // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
@@ -177,10 +163,16 @@
 
         _memory_group.manage(&_mm_result_s32);
 
-        // Configure matrix multiply kernel
-        _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
-                                                                                                              mult_transpose1xW_width, mult_interleave4x4_height,
-                                                                                                              depth_output_gemm3d, reinterpret_input_as_3d));
+        if(_is_gemm_reshaped)
+        {
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+        }
+        else
+        {
+            // Configure matrix multiply kernel
+            _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+        }
 
         // Configure offset contribution kernel
         _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
@@ -190,17 +182,23 @@
     }
     else
     {
-        // Configure matrix multiply kernel
-        _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
-                                                                                                     mult_transpose1xW_width, mult_interleave4x4_height,
-                                                                                                     depth_output_gemm3d, reinterpret_input_as_3d));
+        if(_is_gemm_reshaped)
+        {
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+        }
+        else
+        {
+            // Configure matrix multiply kernel
+            _mm_kernel.configure(matrix_a, matrix_b, output, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+        }
 
         // Configure offset contribution kernel
         _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset);
     }
 
     // Allocate tensors
-    if(_is_interleaved_transposed)
+    if(_is_gemm_reshaped)
     {
         _tmp_a.allocator()->allocate();
         if(!_reshape_b_only_on_first_run)
@@ -233,18 +231,19 @@
     const ITensorInfo *matrix_a_info = a;
     const ITensorInfo *matrix_b_info = b;
 
-    TensorInfo tmp_a_info{};
-    TensorInfo tmp_b_info{};
+    TensorInfo        tmp_a_info{};
+    TensorInfo        tmp_b_info{};
+    GEMMRHSMatrixInfo rhs_info;
+    GEMMLHSMatrixInfo lhs_info;
 
-    bool          reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const int     m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const int     n                         = b->dimension(0);
-    const int     k                         = a->dimension(0);
-    constexpr int mult_transpose1xW_width   = 1;
-    constexpr int mult_interleave4x4_height = 1;
-    const int     depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
 
-    bool reshape_matrices = is_interleaved_transposed(m, n, k, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
+    bool reshape_matrices = is_gemm_reshaped(m, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
 
     // if reshape_matrices is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
     if(reshape_matrices)
@@ -252,20 +251,24 @@
         reinterpret_input_as_3d = false;
     }
 
-    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d);
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
 
     if(reshape_matrices)
     {
         matrix_a_info = &tmp_a_info;
         matrix_b_info = &tmp_b_info;
 
+        // Pick up the GEMM configuration
+        std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+
         // Validate interleave kernel
-        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
+        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
 
         // Validate transpose kernel
-        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+
+        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
     }
 
     TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -292,12 +295,22 @@
     {
         TensorInfo mm_result_s32_info{};
 
-        // Output tensor auto inizialitation if not yet initialized
-        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_matrices, reshape_info)).set_data_type(DataType::S32));
+        if(reshape_matrices)
+        {
+            // Output tensor auto initialization if not yet initialized
+            auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
 
-        // Validate matrix multiply
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_matrices, reshape_info));
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+        }
+        else
+        {
+            // Output tensor auto initialization if not yet initialized
+            auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
 
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, false, reshape_info));
+        }
         // Validate offset contribution kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -309,9 +322,16 @@
     }
     else
     {
-        // Validate matrix multiply
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_matrices, reshape_info));
-
+        if(reshape_matrices)
+        {
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+        }
+        else
+        {
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info));
+        }
         // Validate offset contribution kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
                                                                                  a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -329,7 +349,7 @@
 
     _memory_group.acquire();
 
-    if(_is_interleaved_transposed)
+    if(_is_gemm_reshaped)
     {
         // Run reshape matrix A
         CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
@@ -348,7 +368,14 @@
     }
 
     // Run matrix multiply
-    CLScheduler::get().enqueue(_mm_kernel, false);
+    if(_is_gemm_reshaped)
+    {
+        CLScheduler::get().enqueue(_mm_reshaped_kernel, false);
+    }
+    else
+    {
+        CLScheduler::get().enqueue(_mm_kernel, false);
+    }
 
     // Run matrix A reduction kernel only if _b_offset is not equal to 0
     if(_b_offset != 0)
@@ -374,7 +401,7 @@
 {
     if(!_is_prepared)
     {
-        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
         {
             ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
 
@@ -395,3 +422,4 @@
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
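Reference sketch (not part of the patch): the GEMM dispatch above derives m, n, k and the batch size from the LHS/RHS tensor shapes before deciding between the reshaped and native lowp kernels. The small standalone helper below mirrors that computation under the same dimension convention used in the hunk (LHS shape is (K, M, ...), RHS shape is (N, K, ...)); the struct and function names are illustrative only.

#include <array>
#include <cstdio>

// Illustrative only: mirrors the m/n/k/batch_size computation in the hunk above.
struct GemmDims
{
    unsigned int m, n, k, batch_size;
};

GemmDims compute_gemm_dims(const std::array<unsigned int, 4> &a_shape,
                           const std::array<unsigned int, 4> &b_shape,
                           bool reinterpret_input_as_3d)
{
    GemmDims d{};
    d.m          = reinterpret_input_as_3d ? a_shape[1] * a_shape[2] : a_shape[1];
    d.n          = b_shape[0];
    d.k          = a_shape[0];
    d.batch_size = reinterpret_input_as_3d ? a_shape[3] : a_shape[2];
    return d;
}

int main()
{
    // e.g. LHS of shape (K=64, M=128, batches=8), RHS of shape (N=32, K=64, batches=8)
    const GemmDims d = compute_gemm_dims({ 64, 128, 8, 1 }, { 32, 64, 8, 1 }, false);
    std::printf("m=%u n=%u k=%u batches=%u\n", d.m, d.n, d.k, d.batch_size);
    return 0;
}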
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
new file mode 100644
index 0000000..459438e
--- /dev/null
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGather.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>();
+    k->configure(input, indices, output, axis);
+    _kernel = std::move(k);
+}
+
+Status CLGather::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+    return CLGatherKernel::validate(input, indices, output, axis);
+}
+} // namespace arm_compute
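A minimal usage sketch for the new CLGather function, assuming 1-D U32 indices and an F32 input; the shapes, data types and axis choice here are illustrative and not taken from the patch. Gathering 3 indices along dimension 1 of a (16, 4) input is expected to produce a (16, 3) output.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGather.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor input, indices, output;
    input.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U32)); // rows to pick
    output.allocator()->init(TensorInfo(TensorShape(16U, 3U), 1, DataType::F32));

    // Gather along the second dimension (axis 1)
    CLGather gather;
    gather.configure(&input, &indices, &output, 1);

    input.allocator()->allocate();
    indices.allocator()->allocate();
    output.allocator()->allocate();

    gather.run();
    CLScheduler::get().sync();
    return 0;
}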
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 5dd1202..c50132e 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,6 +40,7 @@
       _memset_kernel(),
       _padded_copy_kernel(),
       _cpp_nms_kernel(),
+      _is_nhwc(false),
       _deltas_permuted(),
       _deltas_flattened(),
       _scores_permuted(),
@@ -60,10 +61,11 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
     ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
 
+    _is_nhwc                         = scores->info()->data_layout() == DataLayout::NHWC;
     const DataType data_type         = deltas->info()->data_type();
-    const int      num_anchors       = scores->info()->dimension(2);
-    const int      feat_width        = scores->info()->dimension(0);
-    const int      feat_height       = scores->info()->dimension(1);
+    const int      num_anchors       = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+    const int      feat_width        = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+    const int      feat_height       = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
     const int      total_num_anchors = num_anchors * feat_width * feat_height;
     const int      pre_nms_topN      = info.pre_nms_topN();
     const int      post_nms_topN     = info.post_nms_topN();
@@ -77,21 +79,37 @@
     _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, data_type));
 
     // Permute and reshape deltas
-    _memory_group.manage(&_deltas_permuted);
-    _memory_group.manage(&_deltas_flattened);
-    _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
-    _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
-    _deltas_permuted.allocator()->allocate();
+    if(!_is_nhwc)
+    {
+        _memory_group.manage(&_deltas_permuted);
+        _memory_group.manage(&_deltas_flattened);
+        _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+        _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+        _deltas_permuted.allocator()->allocate();
+    }
+    else
+    {
+        _memory_group.manage(&_deltas_flattened);
+        _flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
+    }
 
     const TensorShape flatten_shape_scores(1, total_num_anchors);
     _scores_flattened.allocator()->init(TensorInfo(flatten_shape_scores, 1, data_type));
 
     // Permute and reshape scores
-    _memory_group.manage(&_scores_permuted);
-    _memory_group.manage(&_scores_flattened);
-    _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
-    _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
-    _scores_permuted.allocator()->allocate();
+    if(!_is_nhwc)
+    {
+        _memory_group.manage(&_scores_permuted);
+        _memory_group.manage(&_scores_flattened);
+        _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+        _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+        _scores_permuted.allocator()->allocate();
+    }
+    else
+    {
+        _memory_group.manage(&_scores_flattened);
+        _flatten_scores_kernel.configure(scores, &_scores_flattened);
+    }
 
     // Bounding box transform
     _memory_group.manage(&_all_proposals);
@@ -141,11 +159,12 @@
                                           const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW, DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
 
-    const int num_anchors       = scores->dimension(2);
-    const int feat_width        = scores->dimension(0);
-    const int feat_height       = scores->dimension(1);
+    const int num_anchors       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+    const int feat_width        = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+    const int feat_height       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
     const int num_images        = scores->dimension(3);
     const int total_num_anchors = num_anchors * feat_width * feat_height;
     const int values_per_roi    = info.values_per_roi();
@@ -156,14 +175,21 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
 
     TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+    TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+    if(scores->data_layout() == DataLayout::NHWC)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+    }
 
     TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
     ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
 
-    TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
-
     TensorInfo scores_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
     TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
 
@@ -236,9 +262,12 @@
     CLScheduler::get().enqueue(_compute_anchors_kernel, false);
 
     // Transpose and reshape the inputs
-    CLScheduler::get().enqueue(_permute_deltas_kernel, false);
+    if(!_is_nhwc)
+    {
+        CLScheduler::get().enqueue(_permute_deltas_kernel, false);
+        CLScheduler::get().enqueue(_permute_scores_kernel, false);
+    }
     CLScheduler::get().enqueue(_flatten_deltas_kernel, false);
-    CLScheduler::get().enqueue(_permute_scores_kernel, false);
     CLScheduler::get().enqueue(_flatten_scores_kernel, false);
 
     // Build the boxes
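The NHWC support above leans on get_data_layout_dimension_index to translate logical dimensions (WIDTH/HEIGHT/CHANNEL) into tensor indices. As a rough, self-contained sketch of the mapping being assumed (NCHW storing W, H, C at indices 0, 1, 2 and NHWC storing C, W, H at indices 0, 1, 2), the helper below reproduces it outside the library; it is illustrative, not the library implementation.

#include <cassert>

// Assumption: dimension 0 is the fastest-moving index.
enum class Layout { NCHW, NHWC };
enum class Dim { WIDTH, HEIGHT, CHANNEL };

int dim_index(Layout layout, Dim dim)
{
    if(layout == Layout::NCHW)
    {
        switch(dim)
        {
            case Dim::WIDTH:   return 0;
            case Dim::HEIGHT:  return 1;
            case Dim::CHANNEL: return 2;
        }
    }
    switch(dim)
    {
        case Dim::CHANNEL: return 0;
        case Dim::WIDTH:   return 1;
        case Dim::HEIGHT:  return 2;
    }
    return -1;
}

int main()
{
    // With NHWC scores, the anchor count lives in dimension 0 instead of 2,
    // which is why the hard-coded dimension(2)/dimension(0)/dimension(1) lookups were replaced.
    assert(dim_index(Layout::NHWC, Dim::CHANNEL) == 0);
    assert(dim_index(Layout::NCHW, Dim::CHANNEL) == 2);
    return 0;
}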
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 4f709d5..2e3c6d7 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -32,8 +32,8 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
@@ -81,3 +81,4 @@
 
     _memory_group.release();
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index a89c4e3..f01b1b8 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -110,9 +110,9 @@
     _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
     _forget_gate_out2.allocator()->allocate();
     _memory_group.manage(&_forget_gate_out5);
-    _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+    _accum_forget_gate1.configure(ArithmeticOperation::ADD, &_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+    _forget_gate_out1.allocator()->allocate();
     CLTensor *forget_gate_out = &_forget_gate_out5;
-
     if(lstm_params.has_peephole_opt())
     {
         _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
@@ -129,17 +129,18 @@
     {
         _forget_gate_out3.allocator()->allocate();
     }
-    _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
 
     // Configure block that calculates the input gate
     // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     // input_gate = 1 - forget_gate, with CIFG
     _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    CLTensor *input_gate_out = &_input_gate_out1;
     if(lstm_params.has_cifg_opt())
     {
         _memory_group.manage(&_input_gate_out1);
         _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-        _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+        _subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
         _ones.allocator()->allocate();
         _run_cifg_opt = true;
     }
@@ -160,17 +161,23 @@
         _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
         _input_gate_out2.allocator()->allocate();
         _memory_group.manage(&_input_gate_out4);
-        _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+        _accum_input_gate1.configure(ArithmeticOperation::ADD, &_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+        _input_gate_out3.allocator()->allocate();
+        input_gate_out = &_input_gate_out4;
         if(_run_peephole_opt)
         {
             _memory_group.manage(&_input_gate_out5);
             _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
             _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _input_gate_out4.allocator()->allocate();
             _input_gate_out5.allocator()->allocate();
+            input_gate_out = &_input_gate_out1;
         }
-        _input_gate_out3.allocator()->allocate();
-        _input_gate_out4.allocator()->allocate();
-        _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        else
+        {
+            _input_gate_out1.allocator()->allocate();
+        }
+        _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     }
 
     // Configure block that calculates the cell state
@@ -190,14 +197,13 @@
     _gemm_cell_state1.configure(output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
     _cell_state_out2.allocator()->allocate();
     _memory_group.manage(&_cell_state_out4);
-    _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+    _accum_cell_state1.configure(ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
     _activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
     _memory_group.manage(&_cell_state_out5);
-    _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _pixelwise_mul_cell_state1.configure(&_cell_state_out4, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
     _cell_state_out4.allocator()->allocate();
-    _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-    _forget_gate_out1.allocator()->allocate();
-    _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+    _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _accum_cell_state2.configure(ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
     _cell_state_out3.allocator()->allocate();
     _cell_state_out5.allocator()->allocate();
     // Perform clipping
@@ -223,7 +229,7 @@
     _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
     _output2.allocator()->allocate();
     _memory_group.manage(&_output5);
-    _accum_output1.configure(&_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
+    _accum_output1.configure(ArithmeticOperation::ADD, &_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
     _output3.allocator()->allocate();
     CLTensor *output_gate_out = &_output5;
     if(lstm_params.has_peephole_opt())
@@ -284,13 +290,13 @@
     std::vector<ICLTensor *> scratch_inputs;
     if(!lstm_params.has_cifg_opt())
     {
-        scratch_inputs.emplace_back(&_input_gate_out1);
+        scratch_inputs.emplace_back(input_gate_out);
     }
     scratch_inputs.emplace_back(&_cell_state_out1);
     scratch_inputs.emplace_back(forget_gate_out);
     scratch_inputs.emplace_back(output_gate_out);
     _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
-    _input_gate_out1.allocator()->allocate();
+    input_gate_out->allocator()->allocate();
     _cell_state_out1.allocator()->allocate();
     forget_gate_out->allocator()->allocate();
     output_gate_out->allocator()->allocate();
@@ -364,7 +370,7 @@
     // Validate forget gate
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
@@ -396,7 +402,7 @@
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtractionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
 
     // Validate cell state
@@ -544,4 +550,4 @@
     _concat_scratch_buffer.run();
 
     _memory_group.release();
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index 7e5278f..559b57f 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,8 +28,8 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
 #include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
 #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
 #include "support/ToolchainSupport.h"
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index 32d8f15..8489fab 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,7 @@
     _norm_kernel.configure(input, output, norm_info);
 
     // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
-    _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
 }
 
 Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index de43c7d..3aa1b1e 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,21 +34,21 @@
 {
 }
 
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding)
+void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value)
 {
     // Copy the input to the output
     _copy_kernel.configure(input, output, padding);
 
     // Set the pages of the output to zero
-    _memset_kernel.configure(output, PixelValue());
+    _memset_kernel.configure(output, constant_value);
 
     // Fill padding on the first two dimensions with zeros
-    _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT);
+    _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT, constant_value);
 }
 
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding)
+Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, PixelValue()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, constant_value));
     ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, padding));
 
     return Status{};
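A minimal usage sketch for the extended CLPadLayer interface, assuming PaddingList is a per-dimension list of (before, after) pairs and reusing the PixelValue(value, data_type) constructor seen elsewhere in this patch; the shapes are illustrative. Padding a (4, 4) input by 1 on each side of dimension 0 and 2 on each side of dimension 1 gives a (6, 8) output filled with the constant value.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPadLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(6U, 8U), 1, DataType::F32));

    // Pad with ones instead of the previous hard-coded zero fill
    CLPadLayer pad;
    pad.configure(&input, &output, PaddingList{ { 1, 1 }, { 2, 2 } }, PixelValue(1, DataType::F32));

    input.allocator()->allocate();
    output.allocator()->allocate();

    pad.run();
    CLScheduler::get().sync();
    return 0;
}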
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 1809e6e..63f00ac 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -60,7 +60,7 @@
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
 
     return Status{};
@@ -90,7 +90,7 @@
     _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
     _memory_group.manage(&_add_output);
 
-    _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+    _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
 
     _fully_connected_out.allocator()->allocate();
     _gemm_output.allocator()->allocate();
@@ -127,4 +127,4 @@
 
         _is_prepared = true;
     }
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index 0f480ee..7bb4178 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,7 +30,7 @@
 
 using namespace arm_compute;
 
-void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
 {
     // Configure ROI pooling kernel
     auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>();
diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp
new file mode 100644
index 0000000..b2cd472
--- /dev/null
+++ b/src/runtime/CL/functions/CLRange.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRange.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLRangeKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLRange::configure(ICLTensor *output, const float start, const float end, const float step)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLRangeKernel>();
+    k->set_target(CLScheduler::get().target());
+    k->configure(output, start, end, step);
+    _kernel = std::move(k);
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(*_kernel);
+}
+
+Status CLRange::validate(const ITensorInfo *output, const float start, const float end, const float step)
+{
+    return CLRangeKernel::validate(output, start, end, step);
+}
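A minimal usage sketch for the new CLRange function; the output length must match the number of generated steps (here ceil((end - start) / step) = 5), and the F32 data type is an illustrative choice rather than something mandated by the patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLRange.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Generates { 0, 2, 4, 6, 8 }
    CLTensor output;
    output.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::F32));

    CLRange range;
    range.configure(&output, 0.f, 10.f, 2.f);

    output.allocator()->allocate();
    range.run();
    CLScheduler::get().sync();
    return 0;
}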
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index 1016ff7..b2d0f81 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,22 +45,31 @@
     _reduced_outs      = arm_compute::support::cpp14::make_unique<CLTensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
     _keep_dims         = keep_dims;
 
+    Coordinates axis_local = reduction_axis;
+    const int   input_dims = input->info()->num_dimensions();
+
+    // Convert negative axes
+    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    {
+        axis_local[i] = wrap_around(axis_local[i], input_dims);
+    }
+
     // Perform reduction for every axis
     for(unsigned int i = 0; i < _reduction_ops; ++i)
     {
         TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
-        out_shape.set(reduction_axis[i], 1);
+        out_shape.set(axis_local[i], 1);
         auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
 
         if(i == _reduction_ops - 1 && keep_dims)
         {
-            _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
         }
         else
         {
             _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
             _memory_group.manage(_reduced_outs.get() + i);
-            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
         }
     }
 
@@ -77,11 +86,10 @@
 
         // We have to sort the reduction axis vectors in order for remove_dimension
         // to work properly
-        Coordinates axis_copy = reduction_axis;
-        std::sort(axis_copy.begin(), axis_copy.begin() + _reduction_ops);
+        std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
         for(unsigned int i = 0; i < _reduction_ops; ++i)
         {
-            out_shape.remove_dimension(axis_copy[i] - i);
+            out_shape.remove_dimension(axis_local[i] - i);
         }
         auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
         _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
@@ -90,22 +98,43 @@
 
 Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
-    ARM_COMPUTE_UNUSED(keep_dims);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
-    for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i)
+    TensorShape out_shape = input->tensor_shape();
+
+    Coordinates        axis_sorted   = reduction_axis;
+    const unsigned int reduction_ops = reduction_axis.num_dimensions();
+    const int          input_dims    = input->num_dimensions();
+
+    // Convert negative axes
+    for(unsigned int i = 0; i < reduction_ops; ++i)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis[i] > 3);
-        ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(reduction_axis[i]) > input->num_dimensions() - 1);
+        axis_sorted[i] = wrap_around(axis_sorted[i], input_dims);
+    }
+
+    std::sort(axis_sorted.begin(), axis_sorted.begin() + reduction_ops);
+    for(unsigned int i = 0; i < reduction_ops; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(axis_sorted[i] > 3);
+        ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_sorted[i]) > input->num_dimensions() - 1);
         if(output->total_size() > 0 && keep_dims)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1);
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_sorted[i]) != 1);
         }
-
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM));
+        if(keep_dims)
+        {
+            out_shape.set(axis_sorted[i], 1);
+        }
+        else
+        {
+            out_shape.remove_dimension(axis_sorted[i] - i);
+        }
     }
 
+    const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
     return Status{};
 }
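The negative-axis handling above assumes wrap_around maps an axis in [-rank, rank) onto [0, rank). A self-contained sketch of that convention (not the library implementation):

#include <cassert>

// Illustrative only: maps a possibly negative reduction axis onto [0, rank).
int wrap_axis(int axis, int rank)
{
    return axis < 0 ? axis + rank : axis;
}

int main()
{
    // For a 4-D input, axis -1 is the last dimension and axis -4 is the first.
    assert(wrap_axis(-1, 4) == 3);
    assert(wrap_axis(-4, 4) == 0);
    assert(wrap_axis(2, 4) == 2);
    return 0;
}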
 
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index c5447ff..3d82e3f 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,15 +56,19 @@
 } // namespace
 
 CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_quantized()
+    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
 {
 }
 
 Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
-
-    if(axis == 0 && !is_data_type_quantized(input->data_type()))
+    bool               is_serial     = is_data_type_quantized(input->data_type()) || axis != 0;
+    if(is_serial)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
+    }
+    else
     {
         // Create temporary tensor infos
         auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
@@ -81,17 +85,25 @@
         }
 
         ReductionOperation first_kernel_op;
+        ReductionOperation intermediate_kernel_op;
         ReductionOperation last_kernel_op;
         switch(op)
         {
             case ReductionOperation::SUM:
             case ReductionOperation::MEAN_SUM:
-                first_kernel_op = ReductionOperation::SUM;
-                last_kernel_op  = op;
+                first_kernel_op        = ReductionOperation::SUM;
+                intermediate_kernel_op = ReductionOperation::SUM;
+                last_kernel_op         = op;
                 break;
             case ReductionOperation::SUM_SQUARE:
-                first_kernel_op = ReductionOperation::SUM_SQUARE;
-                last_kernel_op  = ReductionOperation::SUM;
+                first_kernel_op        = ReductionOperation::SUM_SQUARE;
+                intermediate_kernel_op = ReductionOperation::SUM;
+                last_kernel_op         = ReductionOperation::SUM;
+                break;
+            case ReductionOperation::PROD:
+                first_kernel_op        = ReductionOperation::PROD;
+                intermediate_kernel_op = ReductionOperation::PROD;
+                last_kernel_op         = ReductionOperation::PROD;
                 break;
             default:
                 ARM_COMPUTE_ERROR("Not supported");
@@ -103,17 +115,13 @@
         // Validate ReductionOperation on intermediate stages
         for(unsigned int i = 1; i < num_of_stages - 1; ++i)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, ReductionOperation::SUM));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op));
         }
 
         // Validate ReductionOperation on the last stage
         const unsigned int last_stage = num_of_stages - 1;
         ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
     }
-    else
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
-    }
 
     return Status{};
 }
@@ -122,65 +130,77 @@
 {
     _num_of_stages  = calculate_number_of_stages(input->info(), axis);
     _reduction_axis = axis;
-    _is_quantized   = is_data_type_quantized(input->info()->data_type());
+    _is_serial      = is_data_type_quantized(input->info()->data_type()) || axis != 0;
 
     // Configure reduction operation kernels
     _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
 
     // Create temporary tensors
-    if(axis == 0 && !_is_quantized)
+    if(_is_serial)
+    {
+        _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+    }
+    else
     {
         _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
-        _sums_vector            = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+        _results_vector         = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
         TensorShape shape{ input->info()->tensor_shape() };
         for(unsigned int i = 0; i < _num_of_stages - 1; i++)
         {
             shape.set(0, ceil(shape.x() / 128.f));
-            _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+            _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
         }
 
         // Apply ReductionOperation only on first kernel
-        _memory_group.manage(_sums_vector.get());
+        _memory_group.manage(_results_vector.get());
 
         ReductionOperation first_kernel_op;
+        ReductionOperation intermediate_kernel_op;
         ReductionOperation last_kernel_op;
+        PixelValue         pixelValue;
         switch(op)
         {
             case ReductionOperation::SUM:
             case ReductionOperation::MEAN_SUM:
-                first_kernel_op = ReductionOperation::SUM;
-                last_kernel_op  = op;
+                first_kernel_op        = ReductionOperation::SUM;
+                intermediate_kernel_op = ReductionOperation::SUM;
+                last_kernel_op         = op;
+                pixelValue             = PixelValue();
                 break;
             case ReductionOperation::SUM_SQUARE:
-                first_kernel_op = ReductionOperation::SUM_SQUARE;
-                last_kernel_op  = ReductionOperation::SUM;
+                first_kernel_op        = ReductionOperation::SUM_SQUARE;
+                intermediate_kernel_op = ReductionOperation::SUM;
+                last_kernel_op         = ReductionOperation::SUM;
+                pixelValue             = PixelValue();
+                break;
+            case ReductionOperation::PROD:
+                first_kernel_op        = ReductionOperation::PROD;
+                intermediate_kernel_op = ReductionOperation::PROD;
+                last_kernel_op         = ReductionOperation::PROD;
+                pixelValue             = PixelValue(1, input->info()->data_type());
                 break;
             default:
                 ARM_COMPUTE_ERROR("Not supported");
         }
 
-        _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, first_kernel_op);
-        _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+        _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op);
+        _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
 
         // Apply ReductionOperation on intermediate stages
         for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
         {
-            _memory_group.manage(_sums_vector.get() + i);
-            _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
-            _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
-            _sums_vector[i - 1].allocator()->allocate();
+            _memory_group.manage(_results_vector.get() + i);
+            _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op);
+            _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
+            _results_vector[i - 1].allocator()->allocate();
         }
 
         // Apply ReductionOperation on the last stage
         const unsigned int last_stage  = _num_of_stages - 1;
         const unsigned int input_width = input->info()->dimension(0);
-        _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
-        _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
-        _sums_vector[last_stage - 1].allocator()->allocate();
-    }
-    else
-    {
-        _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+        _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
+        _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
+        _results_vector[last_stage - 1].allocator()->allocate();
     }
 }
 
@@ -188,7 +208,11 @@
 {
     _memory_group.acquire();
 
-    if(_reduction_axis == 0 && !_is_quantized)
+    if(_is_serial)
+    {
+        CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
+    }
+    else
     {
         for(unsigned int i = 0; i < _num_of_stages; ++i)
         {
@@ -196,10 +220,6 @@
             CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
         }
     }
-    else
-    {
-        CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
-    }
 
     _memory_group.release();
 }
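The pixelValue selection above exists because the border handler pads the reduced dimension before each multi-stage kernel runs, so the fill value has to be the identity element of the operation: 0 for SUM-style reductions, 1 for PROD. A small sketch of that reasoning, mirroring the switch above as an assumption rather than library code:

#include <cassert>

enum class Op { SUM, PROD };

// Identity element used to fill the border so padded elements are no-ops.
float border_fill(Op op)
{
    return op == Op::PROD ? 1.f : 0.f;
}

int main()
{
    // Adding a padded 0 or multiplying by a padded 1 leaves the reduction unchanged.
    assert(3.f + border_fill(Op::SUM) == 3.f);
    assert(3.f * border_fill(Op::PROD) == 3.f);
    return 0;
}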
diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp
new file mode 100644
index 0000000..0f86b9f
--- /dev/null
+++ b/src/runtime/CL/functions/CLReverse.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReverse.h"
+
+#include "arm_compute/core/CL/kernels/CLReverseKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLReverseKernel>();
+    k->configure(input, output, axis);
+    _kernel = std::move(k);
+}
+
+Status CLReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+    return CLReverseKernel::validate(input, output, axis);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
new file mode 100644
index 0000000..90c368e
--- /dev/null
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSelect.h"
+
+#include "arm_compute/core/CL/kernels/CLSelectKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLSelectKernel>();
+    k->configure(c, x, y, output);
+    _kernel = std::move(k);
+}
+
+Status CLSelect::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+    return CLSelectKernel::validate(c, x, y, output);
+}
+} // namespace arm_compute
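A minimal usage sketch for the new CLSelect function, assuming a U8 condition tensor with the same shape as the value tensors; the element-wise semantics (take x where the condition is non-zero, otherwise y) are the expected behaviour rather than something stated in this patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSelect.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor c, x, y, output;
    c.allocator()->init(TensorInfo(TensorShape(8U, 2U), 1, DataType::U8));
    x.allocator()->init(TensorInfo(TensorShape(8U, 2U), 1, DataType::F32));
    y.allocator()->init(TensorInfo(TensorShape(8U, 2U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(8U, 2U), 1, DataType::F32));

    // Element-wise select: output[i] = c[i] != 0 ? x[i] : y[i]
    CLSelect select;
    select.configure(&c, &x, &y, &output);

    c.allocator()->allocate();
    x.allocator()->allocate();
    y.allocator()->allocate();
    output.allocator()->allocate();

    select.run();
    CLScheduler::get().sync();
    return 0;
}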
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
index bef7eca..f630853 100644
--- a/src/runtime/CL/functions/CLSlice.cpp
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -36,10 +36,10 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
     // Get absolute end coordinates
-    const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->info()->tensor_shape(), ends);
+    const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
 
     auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
-    k->configure(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+    k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
     _kernel = std::move(k);
 }
 
@@ -54,8 +54,8 @@
     }));
 
     // Get absolute end coordinates
-    const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->tensor_shape(), ends);
+    const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
 
-    return CLStridedSliceKernel::validate(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+    return CLStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index 76c1e18..a24b72e 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -33,20 +33,19 @@
 namespace arm_compute
 {
 CLSpaceToBatchLayer::CLSpaceToBatchLayer()
-    : _space_to_batch_kernel(), _output(nullptr), _has_padding(false)
+    : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
 {
 }
 
 void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
 
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
+        _memset_kernel.configure(output, PixelValue());
     }
-
-    _output = output;
     _space_to_batch_kernel.configure(input, block_shape, paddings, output);
 }
 
@@ -57,42 +56,35 @@
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
+        _memset_kernel.configure(output, PixelValue());
     }
-
-    _output = output;
     _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
 }
 
 Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
 {
-    return CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
+
+    return Status{};
 }
 
 Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
                                      const ITensorInfo *output)
 {
-    return CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+
+    return Status{};
 }
 
 void CLSpaceToBatchLayer::run()
 {
     // Zero out output only if we have paddings
-    // TODO(micspy01): replace with memset once ready
     if(_has_padding)
     {
-        _output->map(CLScheduler::get().queue(), true);
-        if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
-        {
-            const uint8_t quantized_zero = _output->info()->quantization_info().offset;
-            std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
-        }
-        else
-        {
-            memset(_output->buffer(), 0, _output->info()->total_size());
-        }
-        _output->unmap(CLScheduler::get().queue());
+        CLScheduler::get().enqueue(_memset_kernel, true);
     }
-
     CLScheduler::get().enqueue(_space_to_batch_kernel, true);
 }
 } // namespace arm_compute
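
The removed host-side fill is what the new memset kernel now performs on the device. A rough sketch of that host-side logic, assuming an asymmetric-quantized buffer represents a real 0 by its quantization offset; zero_fill is illustrative only, not part of the patch.

    #include <algorithm>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    void zero_fill(std::vector<uint8_t> &buffer, bool is_qasymm8, uint8_t quantized_zero)
    {
        if(is_qasymm8)
        {
            // A quantized "0" is the zero-point, not the byte value 0
            std::fill(buffer.begin(), buffer.end(), quantized_zero);
        }
        else
        {
            std::memset(buffer.data(), 0, buffer.size());
        }
    }
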
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
new file mode 100644
index 0000000..71327fe
--- /dev/null
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <complex>
+
+#include "arm_compute/runtime/CL/functions/CLStackLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLStackLayer::CLStackLayer() // NOLINT
+    : _input(),
+      _stack_kernels(),
+      _num_inputs(0)
+{
+}
+
+void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
+{
+    _num_inputs    = input.size();
+    _stack_kernels = arm_compute::support::cpp14::make_unique<CLStackLayerKernel[]>(_num_inputs);
+
+    // Wrap around negative values
+    const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
+
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output);
+    }
+}
+
+Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input.empty());
+
+    // Wrap around negative values
+    const size_t       rank   = input[0]->num_dimensions();
+    const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1));
+
+    const unsigned int num_inputs = input.size();
+
+    for(unsigned int i = 0; i < num_inputs; i++)
+    {
+        // All the tensors must have the same rank
+        ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
+        // Validate Kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output));
+    }
+
+    return Status{};
+}
+
+void CLStackLayer::run()
+{
+    for(unsigned i = 0; i < _num_inputs; i++)
+    {
+        CLScheduler::get().enqueue(_stack_kernels[i], false);
+    }
+}
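
A small sketch of the negative-axis handling used by configure() and validate(): assuming wrap_around() maps an axis from [-(rank + 1), rank] onto [0, rank] (the stacked output gains one dimension), the arithmetic reduces to the plain-int helper below, which is illustrative only.

    // Map a possibly negative stack axis onto [0, rank]; e.g. rank 3, axis -1 -> 3.
    int wrap_stack_axis(int axis, int rank)
    {
        const int range = rank + 1; // the stacked output has rank + 1 dimensions
        return ((axis % range) + range) % range;
    }
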
diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp
new file mode 100644
index 0000000..ec6a4ab
--- /dev/null
+++ b/src/runtime/CL/functions/CLTile.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLTile.h"
+
+#include "arm_compute/core/CL/kernels/CLTileKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLTileKernel>();
+    k->configure(input, output, multiples);
+    _kernel = std::move(k);
+}
+
+Status CLTile::validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+    return CLTileKernel::validate(input, output, multiples);
+}
+} // namespace arm_compute
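
For context, the relation between input and tiled output shapes that such a configuration implies, assuming multiples[i] is the repetition count along dimension i; tiled_shape is a hypothetical helper, not part of the runtime.

    #include <cstddef>
    #include <vector>

    // Each output dimension is the input dimension times its tiling multiple.
    std::vector<size_t> tiled_shape(const std::vector<size_t> &input_shape, const std::vector<size_t> &multiples)
    {
        std::vector<size_t> output_shape(input_shape.size());
        for(size_t i = 0; i < input_shape.size(); ++i)
        {
            output_shape[i] = input_shape[i] * (i < multiples.size() ? multiples[i] : 1);
        }
        return output_shape;
    }
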
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
new file mode 100644
index 0000000..428d091
--- /dev/null
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLUnstack.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
+{
+    return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
+}
+
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+{
+    // Sets up the coordinates used to slice the input tensor: all start coordinates are set to 0, and on the unstacking axis the start/end pair is set so that just one 2D tensor is sliced at a time.
+    Coordinates slice_end;
+    slice_start.set_num_dimensions(input_num_dimensions);
+    slice_end.set_num_dimensions(input_num_dimensions);
+    for(size_t k = 0; k < input_num_dimensions; ++k)
+    {
+        slice_start.set(k, 0);
+        slice_end.set(k, -1);
+    }
+    slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(slice_end);
+}
+} // namespace
+
+CLUnstack::CLUnstack() // NOLINT
+    : _num_slices(0),
+      _strided_slice_vector()
+{
+}
+
+void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis)
+{
+    std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
+    std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+        return t->info();
+    });
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis));
+
+    // Wrap around negative values
+    const unsigned int axis_u = wrap_axis(axis, input->info());
+    _num_slices               = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
+    _strided_slice_vector     = arm_compute::support::cpp14::make_unique<CLStridedSlice[]>(_num_slices);
+
+    Coordinates slice_start;
+    int32_t     slice_end_mask;
+    setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
+    for(unsigned int slice = 0; slice < _num_slices; ++slice)
+    {
+        // Adjusts start and end coordinates to take a 2D slice at a time
+        slice_start.set(axis_u, slice);
+        _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+    }
+}
+
+Status CLUnstack::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &output_vector, int axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_vector.empty());
+    ARM_COMPUTE_RETURN_ERROR_ON(axis < (-static_cast<int>(input->tensor_shape().num_dimensions())));
+    ARM_COMPUTE_RETURN_ERROR_ON(axis >= static_cast<int>(input->tensor_shape().num_dimensions()));
+    const unsigned int num_slices = std::min(output_vector.size(), input->dimension(wrap_axis(axis, input)));
+    ARM_COMPUTE_RETURN_ERROR_ON(num_slices > input->dimension(wrap_axis(axis, input)));
+    ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
+    Coordinates slice_start;
+    int32_t     slice_end_mask;
+    for(size_t k = 0; k < num_slices; ++k)
+    {
+        slice_start.set(wrap_axis(axis, input), k);
+        setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
+        ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+    }
+    return Status{};
+}
+
+void CLUnstack::run()
+{
+    for(unsigned i = 0; i < _num_slices; ++i)
+    {
+        _strided_slice_vector[i].run();
+    }
+}
+
+} // namespace arm_compute
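
A compact restatement of the per-slice parameters built in configure(), without CL types. Assumptions: an end-mask bit i means "run dimension i to its end" and a shrink-mask bit i means "drop dimension i from the output", which is why (1 << axis) is passed as the last argument. SliceSpec and make_unstack_slice are illustrative names only.

    #include <cstdint>
    #include <vector>

    struct SliceSpec
    {
        std::vector<int> start;       // start coordinate per dimension
        int32_t          end_mask;    // bit i set: dimension i runs to its end
        int32_t          shrink_mask; // bit i set: dimension i is removed from the output
    };

    SliceSpec make_unstack_slice(unsigned int rank, unsigned int axis, unsigned int index)
    {
        SliceSpec spec;
        spec.start.assign(rank, 0);
        spec.start[axis] = static_cast<int>(index); // pick one plane on the unstack axis
        spec.end_mask    = (1 << rank) - 1;         // every dimension runs to its end
        spec.shrink_mask = 1 << axis;               // the unstack axis disappears in the output
        return spec;
    }
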
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index 46a2d80..d0801a6 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,8 +50,8 @@
     ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
 
     // Output auto initialization if not yet initialized

-    TensorInfo  tmp_output_info = *output->clone();
-    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    TensorInfo        tmp_output_info = *output->clone();
+    const TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
     auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
     switch(num_inputs)
@@ -90,7 +90,7 @@
     {
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
     }
-    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 1abcb67..069196e 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -84,8 +84,8 @@
 } // namespace
 
 CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
-      _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
+    : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _input0(), _input1(), _batched_mm_output(), _original_weights(nullptr),
+      _is_prepared(false)
 {
 }
 
@@ -133,14 +133,7 @@
                                                                                                  (input->info()->data_type() == DataType::F16)));
 
     // Configure output transform
-    _output_transform.configure(&_batched_mm_output, biases, output, winograd_info);
-
-    // Configure activation layer
-    _is_activationlayer_enabled = act_info.enabled();
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
-    }
+    _output_transform.configure(&_batched_mm_output, biases, output, winograd_info, act_info);
 
     // Allocate temporary tensors
     _input0.allocator()->allocate();
@@ -216,11 +209,6 @@
     // Run output transform
     CLScheduler::get().enqueue(_output_transform);
 
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
-
     _memory_group.release();
 }
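
The activation now runs inside the output transform rather than as a separate function, saving one full pass over the output tensor. Sketched on a plain buffer, with ReLU chosen only as an example activation; this is not the kernel's actual code.

    #include <algorithm>
    #include <vector>

    void store_output_with_fused_activation(std::vector<float> &output, bool act_enabled)
    {
        for(float &value : output)
        {
            if(act_enabled)
            {
                value = std::max(value, 0.0f); // applied while the result is written out
            }
        }
    }
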
 
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
index 09e8456..7361eb2 100644
--- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,7 +35,7 @@
     auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>();
     k->configure(input, output, winograd_info);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
 }
 
 Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
diff --git a/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp b/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
new file mode 100644
index 0000000..cd97849
--- /dev/null
+++ b/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h"
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+namespace
+{
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_gemm_reshaped(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
+                                                                        bool lhs_interleave, bool rhs_interleave)
+{
+    GEMMLHSMatrixInfo lhs_info;
+    GEMMRHSMatrixInfo rhs_info;
+
+    // Configure GEMMLHSMatrixInfo
+    lhs_info.m0         = m0;
+    lhs_info.k0         = k0;
+    lhs_info.v0         = ((m / (lhs_info.m0 * v0)) == 0) ? 1 : v0;
+    lhs_info.interleave = lhs_interleave;
+    lhs_info.transpose  = false;
+
+    // Configure GEMMRHSMatrixInfo
+    rhs_info.n0         = n0;
+    rhs_info.k0         = lhs_info.k0;
+    rhs_info.h0         = ((n / (rhs_info.n0 * h0)) == 0) ? 1 : h0;
+    rhs_info.interleave = rhs_interleave;
+    rhs_info.transpose  = true;
+
+    return std::make_pair(lhs_info, rhs_info);
+}
+
+} // namespace
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
+    ARM_COMPUTE_UNUSED(data_type);
+
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+    // Configurations for Mali-G76
+    static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G76 =
+    {
+        { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G76_f32 },
+        { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G76_u8 }
+    };
+
+    // Configurations for Mali-G7x
+    static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G7x =
+    {
+        { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G7x_f32 },
+        { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G7x_u8 }
+    };
+
+    switch(gpu_target)
+    {
+        case GPUTarget::G76:
+            return (this->*gemm_reshaped_configs_G76[data_type])(m, n, k, b);
+        default:
+            return (this->*gemm_reshaped_configs_G7x[data_type])(m, n, k, b);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(n <= 4)
+    {
+        return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
+    }
+    else
+    {
+        return configure_gemm_reshaped(m, n, 5, 4, 4, 2, 16, false, true);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(dot8_supported(CLKernelLibrary::get().get_device()))
+    {
+        if(n <= 4)
+        {
+            return configure_gemm_reshaped(m, n, 4, 2, 16, 2, 2, true, false);
+        }
+        else
+        {
+            return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, true, false);
+        }
+    }
+    else
+    {
+        if(n <= 4)
+        {
+            return configure_gemm_reshaped(m, n, 4, 2, 8, 2, 2, true, false);
+        }
+        else
+        {
+            return configure_gemm_reshaped(m, n, 6, 4, 4, 2, 2, true, true);
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(n <= 4)
+    {
+        return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
+    }
+    else
+    {
+        return configure_gemm_reshaped(m, n, 4, 4, 2, 8, 16, false, false);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(n <= 4)
+    {
+        return configure_gemm_reshaped(m, n, 4, 2, 16, 4, 1, false, false);
+    }
+    else
+    {
+        return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, false, true);
+    }
+}
+} // namespace cl_gemm
+} // namespace arm_compute
\ No newline at end of file
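
The one non-obvious piece of arithmetic in configure_gemm_reshaped() is the clamping of the interleave factors: v0 (and likewise h0) falls back to 1 whenever m (or n) cannot fill a single block of m0 * v0 (or n0 * h0) elements. The same rule in isolation, as a minimal sketch:

    // Returns the requested interleave factor, or 1 if the dimension is too small
    // to fill even one block of block_size * factor elements.
    unsigned int clamp_interleave_factor(unsigned int dim, unsigned int block_size, unsigned int factor)
    {
        return ((dim / (block_size * factor)) == 0) ? 1u : factor;
    }
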
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 2b179fd..5916bb4 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -190,15 +190,19 @@
             return;
         }
 
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
         try
         {
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
             process_workloads(*_workloads, *_feeder, _info);
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
         }
         catch(...)
         {
             _current_exception = std::current_exception();
         }
-
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
         _job_complete = true;
         lock.unlock();
         _cv.notify_one();
@@ -250,18 +254,21 @@
 
     info.thread_id = t;
     process_workloads(workloads, feeder, info);
-
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     try
     {
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
         for(auto &thread : _threads)
         {
             thread.wait();
         }
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     }
     catch(const std::system_error &e)
     {
         std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
     }
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
 }
 #endif /* DOXYGEN_SKIP_THIS */
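
The guards added above keep one code path valid both with and without exception support (builds defining ARM_COMPUTE_EXCEPTIONS_DISABLED, e.g. compiled with -fno-exceptions). Reduced to a skeleton, assuming the guarded body is the only part that may throw; run_guarded is an illustrative name, not library code.

    template <typename Body>
    void run_guarded(Body &&body)
    {
    #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
        try
        {
    #endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
            body();
    #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
        }
        catch(...)
        {
            // record the failure without unwinding; exception-free builds never reach here
        }
    #endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
    }
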
 
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
new file mode 100644
index 0000000..79e619c
--- /dev/null
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -0,0 +1,682 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <list>
+
+namespace arm_compute
+{
+namespace
+{
+Status detection_layer_validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, input_conf, input_priorbox);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_loc->num_dimensions() > 2, "The location input tensor should be [C1, N].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_conf->num_dimensions() > 2, "The confidence input tensor should be [C2, N].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, "The priorbox input tensor should be [C3, 2, N].");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.eta() <= 0.f || info.eta() > 1.f, "Eta should be between 0 and 1");
+
+    const int num_priors = input_priorbox->tensor_shape()[0] / 4;
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) != input_loc->tensor_shape()[0], "Number of priors must match number of location predictions.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) != input_conf->tensor_shape()[0], "Number of priors must match number of confidence predictions.");
+
+    // Validate configured output
+    if(output->total_size() != 0)
+    {
+        const unsigned int max_size = info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), TensorShape(7U, max_size));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, output);
+    }
+
+    return Status{};
+}
+
+/** Function used to sort pair<float, T> in descending order based on the score (first) value.
+ */
+template <typename T>
+bool SortScorePairDescend(const std::pair<float, T> &pair1,
+                          const std::pair<float, T> &pair2)
+{
+    return pair1.first > pair2.first;
+}
+
+/** Get location predictions from input_loc.
+ *
+ * @param[in]  input_loc                The input location prediction.
+ * @param[in]  num                      The number of images.
+ * @param[in]  num_priors               number of predictions per class.
+ * @param[in]  num_loc_classes          number of location classes. It is 1 if share_location is true,
+ *                                      and is equal to number of classes needed to predict otherwise.
+ * @param[in]  share_location           If true, all classes share the same location prediction.
+ * @param[out] all_location_predictions All the location predictions.
+ *
+ */
+void retrieve_all_loc_predictions(const ITensor *input_loc, const int num,
+                                  const int num_priors, const int num_loc_classes,
+                                  const bool share_location, std::vector<LabelBBox> &all_location_predictions)
+{
+    for(int i = 0; i < num; ++i)
+    {
+        for(int c = 0; c < num_loc_classes; ++c)
+        {
+            int label = share_location ? -1 : c;
+            if(all_location_predictions[i].find(label) == all_location_predictions[i].end())
+            {
+                all_location_predictions[i][label].resize(num_priors);
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR_ON(all_location_predictions[i][label].size() != static_cast<size_t>(num_priors));
+                break;
+            }
+        }
+    }
+    for(int i = 0; i < num; ++i)
+    {
+        for(int p = 0; p < num_priors; ++p)
+        {
+            for(int c = 0; c < num_loc_classes; ++c)
+            {
+                const int label    = share_location ? -1 : c;
+                const int base_ptr = i * num_priors * num_loc_classes * 4 + p * num_loc_classes * 4 + c * 4;
+                //xmin, ymin, xmax, ymax
+                all_location_predictions[i][label][p][0] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr)));
+                all_location_predictions[i][label][p][1] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1)));
+                all_location_predictions[i][label][p][2] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2)));
+                all_location_predictions[i][label][p][3] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3)));
+            }
+        }
+    }
+}
+
+/** Get confidence predictions from input_conf.
+ *
+ * @param[in]  input_conf            The input confidence prediction.
+ * @param[in]  num                   The number of images.
+ * @param[in]  num_priors            Number of predictions per class.
+ * @param[in]  num_classes           Number of classes to predict.
+ * @param[out] all_confidence_scores All the confidence scores, stored per image as a map
+ *                                   from class label to the per-prior scores.
+ *
+ */
+void retrieve_all_conf_scores(const ITensor *input_conf, const int num,
+                              const int num_priors, const int                 num_classes,
+                              std::vector<std::map<int, std::vector<float>>> &all_confidence_scores)
+{
+    std::vector<float> tmp_buffer;
+    tmp_buffer.resize(num * num_priors * num_classes);
+    for(int i = 0; i < num; ++i)
+    {
+        for(int c = 0; c < num_classes; ++c)
+        {
+            for(int p = 0; p < num_priors; ++p)
+            {
+                tmp_buffer[i * num_classes * num_priors + c * num_priors + p] =
+                    *reinterpret_cast<float *>(input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c)));
+            }
+        }
+    }
+    for(int i = 0; i < num; ++i)
+    {
+        for(int c = 0; c < num_classes; ++c)
+        {
+            all_confidence_scores[i][c].resize(num_priors);
+            all_confidence_scores[i][c].assign(&tmp_buffer[i * num_classes * num_priors + c * num_priors],
+                                               &tmp_buffer[i * num_classes * num_priors + c * num_priors + num_priors]);
+        }
+    }
+}
+
+/** Get prior boxes from input_priorbox.
+ *
+ * @param[in]  input_priorbox      The input prior boxes and their variances.
+ * @param[in]  num_priors          Number of priors.
+ * @param[out] all_prior_bboxes    All the prior bounding boxes, each stored as
+ *                                 {xmin, ymin, xmax, ymax}.
+ * @param[out] all_prior_variances The variances associated with each prior box,
+ *                                 used to scale the location offsets when decoding.
+ *
+ */
+void retrieve_all_priorbox(const ITensor               *input_priorbox,
+                           const int                    num_priors,
+                           std::vector<NormalizedBBox> &all_prior_bboxes,
+                           std::vector<std::array<float, 4>> &all_prior_variances)
+{
+    for(int i = 0; i < num_priors; ++i)
+    {
+        all_prior_bboxes[i] =
+        {
+            {
+                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))),
+                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))),
+                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))),
+                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))
+            }
+        };
+    }
+
+    std::array<float, 4> var({ { 0, 0, 0, 0 } });
+    for(int i = 0; i < num_priors; ++i)
+    {
+        for(int j = 0; j < 4; ++j)
+        {
+            var[j] = *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates((num_priors + i) * 4 + j)));
+        }
+        all_prior_variances[i] = var;
+    }
+}
+
+/** Decode a bbox according to a prior bbox.
+ *
+ * @param[in]  prior_bbox                 The input prior bounding boxes.
+ * @param[in]  prior_variance             The corresponding input variance.
+ * @param[in]  code_type                  The detection output code type used to decode the results.
+ * @param[in]  variance_encoded_in_target If true, the variance is encoded in target.
+ * @param[in]  clip_bbox                  If true, the results should be between 0.f and 1.f.
+ * @param[in]  bbox                       The input bbox to decode
+ * @param[out] decode_bbox                The decoded bboxes.
+ *
+ */
+void DecodeBBox(const NormalizedBBox &prior_bbox, const std::array<float, 4> &prior_variance,
+                const DetectionOutputLayerCodeType code_type, const bool variance_encoded_in_target,
+                const bool clip_bbox, const NormalizedBBox &bbox, NormalizedBBox &decode_bbox)
+{
+    // if the variance is encoded in target, we simply need to add the offset predictions
+    // otherwise we need to scale the offset accordingly.
+    switch(code_type)
+    {
+        case DetectionOutputLayerCodeType::CORNER:
+        {
+            decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]);
+            decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]);
+            decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]);
+            decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]);
+
+            break;
+        }
+        case DetectionOutputLayerCodeType::CENTER_SIZE:
+        {
+            const float prior_width  = prior_bbox[2] - prior_bbox[0];
+            const float prior_height = prior_bbox[3] - prior_bbox[1];
+
+            // Check if the prior width and height are right
+            ARM_COMPUTE_ERROR_ON(prior_width <= 0.f);
+            ARM_COMPUTE_ERROR_ON(prior_height <= 0.f);
+
+            const float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.;
+            const float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.;
+
+            const float decode_bbox_center_x = (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x;
+            const float decode_bbox_center_y = (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y;
+            const float decode_bbox_width    = (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width;
+            const float decode_bbox_height   = (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height;
+
+            decode_bbox[0] = (decode_bbox_center_x - decode_bbox_width / 2.f);
+            decode_bbox[1] = (decode_bbox_center_y - decode_bbox_height / 2.f);
+            decode_bbox[2] = (decode_bbox_center_x + decode_bbox_width / 2.f);
+            decode_bbox[3] = (decode_bbox_center_y + decode_bbox_height / 2.f);
+
+            break;
+        }
+        case DetectionOutputLayerCodeType::CORNER_SIZE:
+        {
+            const float prior_width  = prior_bbox[2] - prior_bbox[0];
+            const float prior_height = prior_bbox[3] - prior_bbox[1];
+
+            // Check if the prior width and height are greater than 0
+            ARM_COMPUTE_ERROR_ON(prior_width <= 0.f);
+            ARM_COMPUTE_ERROR_ON(prior_height <= 0.f);
+
+            decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width;
+            decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height;
+            decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width;
+            decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height;
+
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Unsupported Detection Output Code Type.");
+    }
+
+    if(clip_bbox)
+    {
+        for(auto &d_bbox : decode_bbox)
+        {
+            d_bbox = utility::clamp(d_bbox, 0.f, 1.f);
+        }
+    }
+}
+
+/** Do non maximum suppression given bboxes and scores.
+ *
+ * @param[in]  bboxes          The input bounding boxes.
+ * @param[in]  scores          The corresponding input confidence.
+ * @param[in]  score_threshold The threshold used to filter detection results.
+ * @param[in]  nms_threshold   The threshold used in non maximum suppression.
+ * @param[in]  eta             Adaptation rate for nms threshold.
+ * @param[in]  top_k           If not -1, keep at most top_k picked indices.
+ * @param[out] indices         The kept indices of bboxes after nms.
+ *
+ */
+void ApplyNMSFast(const std::vector<NormalizedBBox> &bboxes,
+                  const std::vector<float> &scores, const float score_threshold,
+                  const float nms_threshold, const float eta, const int top_k,
+                  std::vector<int> &indices)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(bboxes.size() != scores.size(), "bboxes and scores have different size.");
+
+    // Get top_k scores (with corresponding indices).
+    std::list<std::pair<float, int>> score_index_vec;
+
+    // Generate index score pairs.
+    for(size_t i = 0; i < scores.size(); ++i)
+    {
+        if(scores[i] > score_threshold)
+        {
+            score_index_vec.emplace_back(std::make_pair(scores[i], i));
+        }
+    }
+
+    // Sort the score pair according to the scores in descending order
+    score_index_vec.sort(SortScorePairDescend<int>);
+
+    // Keep top_k scores if needed.
+    const int score_index_vec_size = score_index_vec.size();
+    if(top_k > -1 && top_k < score_index_vec_size)
+    {
+        score_index_vec.resize(top_k);
+    }
+
+    // Do nms.
+    float adaptive_threshold = nms_threshold;
+    indices.clear();
+
+    while(!score_index_vec.empty())
+    {
+        const int idx  = score_index_vec.front().second;
+        bool      keep = true;
+        for(int kept_idx : indices)
+        {
+            if(keep)
+            {
+                // Compute the jaccard (intersection over union IoU) overlap between two bboxes.
+                NormalizedBBox intersect_bbox = std::array<float, 4>({ { 0, 0, 0, 0 } });
+                if(bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1])
+                {
+                    intersect_bbox = std::array<float, 4>({ { 0, 0, 0, 0 } });
+                }
+                else
+                {
+                    intersect_bbox = std::array<float, 4>({ {
+                            std::max(bboxes[idx][0], bboxes[kept_idx][0]),
+                            std::max(bboxes[idx][1], bboxes[kept_idx][1]),
+                            std::min(bboxes[idx][2], bboxes[kept_idx][2]),
+                            std::min(bboxes[idx][3], bboxes[kept_idx][3])
+                        }
+                    });
+                }
+
+                float intersect_width  = intersect_bbox[2] - intersect_bbox[0];
+                float intersect_height = intersect_bbox[3] - intersect_bbox[1];
+
+                float overlap = 0.f;
+                if(intersect_width > 0 && intersect_height > 0)
+                {
+                    float intersect_size = intersect_width * intersect_height;
+                    float bbox1_size     = (bboxes[idx][2] < bboxes[idx][0]
+                                            || bboxes[idx][3] < bboxes[idx][1]) ?
+                                           0.f :
+                                           (bboxes[idx][2] - bboxes[idx][0]) * (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]);
+                    float bbox2_size = (bboxes[kept_idx][2] < bboxes[kept_idx][0]
+                                        || bboxes[kept_idx][3] < bboxes[kept_idx][1]) ?
+                                       0.f :
+                                       (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]);
+                    overlap = intersect_size / (bbox1_size + bbox2_size - intersect_size);
+                }
+                keep = (overlap <= adaptive_threshold);
+            }
+            else
+            {
+                break;
+            }
+        }
+        if(keep)
+        {
+            indices.push_back(idx);
+        }
+        score_index_vec.erase(score_index_vec.begin());
+        if(keep && eta < 1.f && adaptive_threshold > 0.5f)
+        {
+            adaptive_threshold *= eta;
+        }
+    }
+}
+
+Status non_max_suppression_validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
+                                              const float score_threshold, const float nms_threshold)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, indices);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->num_dimensions() > 1, "The indices must be a 1-D integer tensor of shape [M], where max_output_size <= M");
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "Scores must be a 1D float tensor");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->dimension(0) == 0, "The indices tensor must not be empty");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(nms_threshold < 0.f || nms_threshold > 1.f, "Threshold must be in [0,1]");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(score_threshold < 0.f || score_threshold > 1.f, "Threshold must be in [0,1]");
+
+    return Status{};
+}
+} // namespace
+
+CPPNonMaximumSuppression::CPPNonMaximumSuppression()
+    : _bboxes(nullptr), _scores(nullptr), _indices(nullptr), _max_output_size(0), _score_threshold(0.f), _nms_threshold(0.f)
+{
+}
+
+void CPPNonMaximumSuppression::configure(
+    const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size,
+    const float score_threshold, const float nms_threshold)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(bboxes, scores, indices);
+    ARM_COMPUTE_ERROR_THROW_ON(non_max_suppression_validate_arguments(bboxes->info(), scores->info(), indices->info(), max_output_size, score_threshold, nms_threshold));
+
+    // copy scores also to a vector
+    _bboxes  = bboxes;
+    _scores  = scores;
+    _indices = indices;
+
+    _nms_threshold   = nms_threshold;
+    _max_output_size = max_output_size;
+    _score_threshold = score_threshold;
+}
+
+Status CPPNonMaximumSuppression::validate(
+    const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
+    const float score_threshold, const float nms_threshold)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(non_max_suppression_validate_arguments(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold));
+    return Status{};
+}
+
+void extract_bounding_boxes_from_tensor(const ITensor *bboxes, std::vector<NormalizedBBox> &bboxes_vector)
+{
+    Window input_win;
+    input_win.use_tensor_dimensions(bboxes->info()->tensor_shape());
+    input_win.set_dimension_step(0U, 4U);
+    input_win.set_dimension_step(1U, 1U);
+    Iterator input(bboxes, input_win);
+    auto     f = [&bboxes_vector, &input](const Coordinates &)
+    {
+        const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+        bboxes_vector.push_back(NormalizedBBox({ { *input_ptr, *(input_ptr + 1), *(2 + input_ptr), *(3 + input_ptr) } }));
+    };
+    execute_window_loop(input_win, f, input);
+}
+
+void extract_scores_from_tensor(const ITensor *scores, std::vector<float> &scores_vector)
+{
+    Window window;
+    window.use_tensor_dimensions(scores->info()->tensor_shape());
+    Iterator it(scores, window);
+    auto     f = [&it, &scores_vector](const Coordinates &)
+    {
+        const auto input_ptr = reinterpret_cast<const float *>(it.ptr());
+        scores_vector.push_back(*input_ptr);
+    };
+    execute_window_loop(window, f, it);
+}
+
+void CPPNonMaximumSuppression::run()
+{
+    std::vector<NormalizedBBox> bboxes_vector;
+    std::vector<float>          scores_vector;
+    std::vector<int>            indices_vector;
+    extract_bounding_boxes_from_tensor(_bboxes, bboxes_vector);
+    extract_scores_from_tensor(_scores, scores_vector);
+    ApplyNMSFast(bboxes_vector, scores_vector, _score_threshold, _nms_threshold, 1, -1 /* disable top_k */, indices_vector);
+    std::copy_n(indices_vector.begin(), std::min(indices_vector.size(), _indices->info()->dimension(0)), reinterpret_cast<int *>(_indices->ptr_to_element(Coordinates(0))));
+}
+
+CPPDetectionOutputLayer::CPPDetectionOutputLayer()
+    : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(),
+      _all_prior_variances(), _all_decode_bboxes(), _all_indices()
+{
+}
+
+void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, ITensor *output, DetectionOutputLayerInfo info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
+    // Output auto initialization if not yet initialized
+    // Since the number of bboxes to be kept is unknown before nms, the shape is set to the maximum
+    // The maximum is keep_top_k * input_loc_size[1]
+    // Each row is a 7 dimension std::vector, which stores [image_id, label, confidence, xmin, ymin, xmax, ymax]
+    const unsigned int max_size = info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1);
+    auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size)));
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(detection_layer_validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info));
+
+    _input_loc      = input_loc;
+    _input_conf     = input_conf;
+    _input_priorbox = input_priorbox;
+    _output         = output;
+    _info           = info;
+    _num_priors     = input_priorbox->info()->dimension(0) / 4;
+    _num            = (_input_loc->info()->num_dimensions() > 1 ? _input_loc->info()->dimension(1) : 1);
+
+    _all_location_predictions.resize(_num);
+    _all_confidence_scores.resize(_num);
+    _all_prior_bboxes.resize(_num_priors);
+    _all_prior_variances.resize(_num_priors);
+    _all_decode_bboxes.resize(_num);
+
+    for(int i = 0; i < _num; ++i)
+    {
+        for(int c = 0; c < _info.num_loc_classes(); ++c)
+        {
+            const int label = _info.share_location() ? -1 : c;
+            if(label == _info.background_label_id())
+            {
+                // Ignore background class.
+                continue;
+            }
+            _all_decode_bboxes[i][label].resize(_num_priors);
+        }
+    }
+    _all_indices.resize(_num);
+
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+}
+
+Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(detection_layer_validate_arguments(input_loc, input_conf, input_priorbox, output, info));
+    return Status{};
+}
+
+void CPPDetectionOutputLayer::run()
+{
+    // Retrieve all location predictions.
+    retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), _all_location_predictions);
+
+    // Retrieve all confidences.
+    retrieve_all_conf_scores(_input_conf, _num, _num_priors, _info.num_classes(), _all_confidence_scores);
+
+    // Retrieve all prior bboxes.
+    retrieve_all_priorbox(_input_priorbox, _num_priors, _all_prior_bboxes, _all_prior_variances);
+
+    // Decode all loc predictions to bboxes
+    const bool clip_bbox = false;
+    for(int i = 0; i < _num; ++i)
+    {
+        for(int c = 0; c < _info.num_loc_classes(); ++c)
+        {
+            const int label = _info.share_location() ? -1 : c;
+            if(label == _info.background_label_id())
+            {
+                // Ignore background class.
+                continue;
+            }
+            ARM_COMPUTE_ERROR_ON_MSG(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label);
+
+            const std::vector<NormalizedBBox> &label_loc_preds = _all_location_predictions[i].find(label)->second;
+
+            const int num_bboxes = _all_prior_bboxes.size();
+            ARM_COMPUTE_ERROR_ON(_all_prior_variances[i].size() != 4);
+
+            for(int j = 0; j < num_bboxes; ++j)
+            {
+                DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], _all_decode_bboxes[i][label][j]);
+            }
+        }
+    }
+
+    int num_kept = 0;
+
+    for(int i = 0; i < _num; ++i)
+    {
+        const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
+        const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
+
+        std::map<int, std::vector<int>> indices;
+        int num_det = 0;
+        for(int c = 0; c < _info.num_classes(); ++c)
+        {
+            if(c == _info.background_label_id())
+            {
+                // Ignore background class
+                continue;
+            }
+            const int label = _info.share_location() ? -1 : c;
+            if(conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end())
+            {
+                ARM_COMPUTE_ERROR("Could not find predictions for label %d.", label);
+            }
+            const std::vector<float>          &scores = conf_scores.find(c)->second;
+            const std::vector<NormalizedBBox> &bboxes = decode_bboxes.find(label)->second;
+
+            ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), _info.top_k(), indices[c]);
+
+            num_det += indices[c].size();
+        }
+
+        int num_to_add = 0;
+        if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k())
+        {
+            std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
+            for(auto it : indices)
+            {
+                const int               label         = it.first;
+                const std::vector<int> &label_indices = it.second;
+
+                if(conf_scores.find(label) == conf_scores.end())
+                {
+                    ARM_COMPUTE_ERROR("Could not find predictions for label %d.", label);
+                }
+
+                const std::vector<float> &scores = conf_scores.find(label)->second;
+                for(auto idx : label_indices)
+                {
+                    ARM_COMPUTE_ERROR_ON(idx > static_cast<int>(scores.size()));
+                    score_index_pairs.push_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
+                }
+            }
+
+            // Keep top k results per image.
+            std::sort(score_index_pairs.begin(), score_index_pairs.end(), SortScorePairDescend<std::pair<int, int>>);
+            score_index_pairs.resize(_info.keep_top_k());
+
+            // Store the new indices.
+
+            std::map<int, std::vector<int>> new_indices;
+            for(auto score_index_pair : score_index_pairs)
+            {
+                int label = score_index_pair.second.first;
+                int idx   = score_index_pair.second.second;
+                new_indices[label].push_back(idx);
+            }
+            _all_indices[i] = new_indices;
+            num_to_add      = _info.keep_top_k();
+        }
+        else
+        {
+            _all_indices[i] = indices;
+            num_to_add      = num_det;
+        }
+        num_kept += num_to_add;
+    }
+
+    // Update the valid region of the output to mark the exact number of detections
+    _output->info()->set_valid_region(ValidRegion(Coordinates(0, 0), TensorShape(7, num_kept)));
+
+    int count = 0;
+    for(int i = 0; i < _num; ++i)
+    {
+        const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
+        const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
+        for(auto &it : _all_indices[i])
+        {
+            const int                 label     = it.first;
+            const std::vector<float> &scores    = conf_scores.find(label)->second;
+            const int                 loc_label = _info.share_location() ? -1 : label;
+            if(conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end())
+            {
+                // Either there are no confidence predictions
+                // or there are no location predictions for the current label.
+                ARM_COMPUTE_ERROR("Could not find predictions for the label %d.", label);
+            }
+            const std::vector<NormalizedBBox> &bboxes  = decode_bboxes.find(loc_label)->second;
+            const std::vector<int>            &indices = it.second;
+
+            for(auto idx : indices)
+            {
+                *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7))))     = i;
+                *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 1)))) = label;
+                *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 2)))) = scores[idx];
+                *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 3)))) = bboxes[idx][0];
+                *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 4)))) = bboxes[idx][1];
+                *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 5)))) = bboxes[idx][2];
+                *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 6)))) = bboxes[idx][3];
+
+                ++count;
+            }
+        }
+    }
+}
+} // namespace arm_compute
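
The overlap test at the heart of ApplyNMSFast is a standard intersection-over-union check. An equivalent self-contained formulation on plain {xmin, ymin, xmax, ymax} arrays, for reference (not the exact code above):

    #include <algorithm>
    #include <array>

    float intersection_over_union(const std::array<float, 4> &a, const std::array<float, 4> &b)
    {
        const float iw           = std::max(0.0f, std::min(a[2], b[2]) - std::max(a[0], b[0]));
        const float ih           = std::max(0.0f, std::min(a[3], b[3]) - std::max(a[1], b[1]));
        const float intersection = iw * ih;
        const float area_a       = std::max(0.0f, a[2] - a[0]) * std::max(0.0f, a[3] - a[1]);
        const float area_b       = std::max(0.0f, b[2] - b[0]) * std::max(0.0f, b[3] - b[1]);
        const float union_area   = area_a + area_b - intersection;
        return union_area > 0.0f ? intersection / union_area : 0.0f;
    }
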
diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp
new file mode 100644
index 0000000..c4e1eab
--- /dev/null
+++ b/src/runtime/CPP/functions/CPPTopKV.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPTopKV.h"
+
+#include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CPPTopKV::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k)
+{
+    auto kernel = arm_compute::support::cpp14::make_unique<CPPTopKVKernel>();
+    kernel->configure(predictions, targets, output, k);
+    _kernel = std::move(kernel);
+}
+
+Status CPPTopKV::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+{
+    return CPPTopKVKernel::validate(predictions, targets, output, k);
+}
+} // namespace arm_compute
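
For reference, a minimal usage sketch of the new CPPTopKV function. This is not part of the patch; the tensor shapes and data types below are illustrative assumptions, and CPPTopKVKernel::validate() defines the real constraints.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CPP/functions/CPPTopKV.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void topkv_example()
    {
        Tensor predictions, targets, found;
        predictions.allocator()->init(TensorInfo(TensorShape(10U, 20U), 1, DataType::F32)); // classes x batch (assumed layout)
        targets.allocator()->init(TensorInfo(TensorShape(20U), 1, DataType::U32));          // one target class index per sample
        found.allocator()->init(TensorInfo(TensorShape(20U), 1, DataType::U8));             // non-zero if the target is within the top k

        CPPTopKV topkv;
        topkv.configure(&predictions, &targets, &found, 5 /* k */);

        predictions.allocator()->allocate();
        targets.allocator()->allocate();
        found.allocator()->allocate();
        // ... fill predictions and targets, then:
        topkv.run();
    }
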
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index ac19d08..f3355a7 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -39,7 +39,8 @@
 #include <unistd.h>
 
 #ifndef BARE_METAL
-#include <regex>
+/* C++ std::regex takes up a lot of space in the standalone builds */
+#include <regex.h>
 #include <thread>
 #endif /* BARE_METAL */
 
@@ -94,6 +95,7 @@
             return false;
     }
 }
+
 /* Convert an MIDR register value to a CPUModel enum value. */
 CPUModel midr_to_model(const unsigned int midr)
 {
@@ -144,6 +146,19 @@
                 break;
         }
     }
+    else if(implementer == 0x48) // HiSilicon CPUs
+    {
+        // Only CPUs we have code paths for are detected.  All other CPUs can be safely classed as "GENERIC"
+        switch(cpunum)
+        {
+            case 0xd40: // A76 (Kirin 980)
+                model = CPUModel::GENERIC_FP16_DOT;
+                break;
+            default:
+                model = CPUModel::GENERIC;
+                break;
+        }
+    }
 
     return model;
 }
@@ -172,12 +187,27 @@
 
 void populate_models_cpuinfo(std::vector<CPUModel> &cpusv)
 {
+    regex_t proc_regex;
+    regex_t imp_regex;
+    regex_t var_regex;
+    regex_t part_regex;
+    regex_t rev_regex;
+
+    memset(&proc_regex, 0, sizeof(regex_t));
+    memset(&imp_regex, 0, sizeof(regex_t));
+    memset(&var_regex, 0, sizeof(regex_t));
+    memset(&part_regex, 0, sizeof(regex_t));
+    memset(&rev_regex, 0, sizeof(regex_t));
+
+    int ret_status = 0;
     // If "long-form" cpuinfo is present, parse that to populate models.
-    std::regex proc_regex(R"(^processor.*(\d+)$)");
-    std::regex imp_regex(R"(^CPU implementer.*0x(..)$)");
-    std::regex var_regex(R"(^CPU variant.*0x(.)$)");
-    std::regex part_regex(R"(^CPU part.*0x(...)$)");
-    std::regex rev_regex(R"(^CPU revision.*(\d+)$)");
+    ret_status |= regcomp(&proc_regex, R"(^processor.*([[:digit:]]+)$)", REG_EXTENDED);
+    ret_status |= regcomp(&imp_regex, R"(^CPU implementer.*0x(..)$)", REG_EXTENDED);
+    ret_status |= regcomp(&var_regex, R"(^CPU variant.*0x(.)$)", REG_EXTENDED);
+    ret_status |= regcomp(&part_regex, R"(^CPU part.*0x(...)$)", REG_EXTENDED);
+    ret_status |= regcomp(&rev_regex, R"(^CPU revision.*([[:digit:]]+)$)", REG_EXTENDED);
+    ARM_COMPUTE_UNUSED(ret_status);
+    ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed.");
 
     std::ifstream file;
     file.open("/proc/cpuinfo", std::ios::in);
@@ -190,11 +220,11 @@
 
         while(bool(getline(file, line)))
         {
-            std::smatch match;
-
-            if(std::regex_match(line, match, proc_regex))
+            regmatch_t match[2];
+            ret_status = regexec(&proc_regex, line.c_str(), 2, match, 0);
+            if(ret_status == 0)
             {
-                std::string id     = match[1];
+                std::string id     = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
                 int         newcpu = support::cpp11::stoi(id, nullptr);
 
                 if(curcpu >= 0 && midr == 0)
@@ -214,32 +244,44 @@
                 continue;
             }
 
-            if(std::regex_match(line, match, imp_regex))
+            ret_status = regexec(&imp_regex, line.c_str(), 2, match, 0);
+            if(ret_status == 0)
             {
-                int impv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
+                std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+                int         impv   = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
                 midr |= (impv << 24);
+
                 continue;
             }
 
-            if(std::regex_match(line, match, var_regex))
+            ret_status = regexec(&var_regex, line.c_str(), 2, match, 0);
+            if(ret_status == 0)
             {
-                int varv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
+                std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+                int         varv   = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
                 midr |= (varv << 20);
+
                 continue;
             }
 
-            if(std::regex_match(line, match, part_regex))
+            ret_status = regexec(&part_regex, line.c_str(), 2, match, 0);
+            if(ret_status == 0)
             {
-                int partv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
+                std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+                int         partv  = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
                 midr |= (partv << 4);
+
                 continue;
             }
 
-            if(std::regex_match(line, match, rev_regex))
+            ret_status = regexec(&rev_regex, line.c_str(), 2, match, 0);
+            if(ret_status == 0)
             {
-                int regv = support::cpp11::stoi(match[1], nullptr);
+                std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+                int         regv   = support::cpp11::stoi(subexp, nullptr);
                 midr |= (regv);
                 midr |= (0xf << 16);
+
                 continue;
             }
         }
@@ -249,6 +291,13 @@
             cpusv[curcpu] = midr_to_model(midr);
         }
     }
+
+    // Free allocated memory
+    regfree(&proc_regex);
+    regfree(&imp_regex);
+    regfree(&var_regex);
+    regfree(&part_regex);
+    regfree(&rev_regex);
 }
 
 int get_max_cpus()
@@ -364,8 +413,11 @@
     std::map<std::string, unsigned int> cpu_part_occurrence_map;
 
     // CPU part regex
-    std::regex  cpu_part_rgx(R"(.*CPU part.+?(?=:).+?(?=\w+)(\w+).*)");
-    std::smatch cpu_part_match;
+    regex_t cpu_part_rgx;
+    memset(&cpu_part_rgx, 0, sizeof(regex_t));
+    int ret_status = regcomp(&cpu_part_rgx, R"(.*CPU part.+/?\:[[:space:]]+([[:alnum:]]+).*)", REG_EXTENDED);
+    ARM_COMPUTE_UNUSED(ret_status);
+    ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed.");
 
     // Read cpuinfo and get occurrence of each core
     std::ifstream cpuinfo;
@@ -375,9 +427,11 @@
         std::string line;
         while(bool(getline(cpuinfo, line)))
         {
-            if(std::regex_search(line.cbegin(), line.cend(), cpu_part_match, cpu_part_rgx))
+            regmatch_t match[2];
+            ret_status = regexec(&cpu_part_rgx, line.c_str(), 2, match, 0);
+            if(ret_status == 0)
             {
-                std::string cpu_part = cpu_part_match[1];
+                std::string cpu_part = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
                 if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end())
                 {
                     cpu_part_occurrence_map[cpu_part]++;
@@ -389,6 +443,7 @@
             }
         }
     }
+    regfree(&cpu_part_rgx);
 
     // Get min number of threads
     auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(),
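
As an aside, a standalone sketch of the regcomp/regexec/regfree pattern that the reworked CPUUtils code relies on: compile the pattern once with REG_EXTENDED, run it per line, slice sub-match 1 out of the line via rm_so/rm_eo, then free it. The sample /proc/cpuinfo line is an assumption.

    #include <regex.h>

    #include <iostream>
    #include <string>

    int main()
    {
        regex_t part_regex;
        regcomp(&part_regex, "^CPU part.*0x(...)$", REG_EXTENDED);

        const std::string line = "CPU part        : 0xd08";
        regmatch_t match[2]; // match[0] is the whole line, match[1] the first sub-expression
        if(regexec(&part_regex, line.c_str(), 2, match, 0) == 0)
        {
            // Sub-match 1 spans [rm_so, rm_eo) within the line
            std::cout << line.substr(match[1].rm_so, match[1].rm_eo - match[1].rm_so) << std::endl; // prints "d08"
        }
        regfree(&part_regex);
        return 0;
    }
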
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index c58d184..a35a18a 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -170,7 +170,7 @@
     {
         BorderSize border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
         input->info()->extend_padding(border_size);
-        _fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue(0)); // for PAD of im2col fp16: consider it as border
+        _fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue()); // for PAD of im2col fp16: consider it as border
     }
     // Configure im2col
     _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
index 689d8be..aa937a6 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,7 +53,7 @@
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
         _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
-        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
 
         depth_offset += inputs_vector.at(i)->info()->dimension(2);
     }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index d9aa50d..ba05838 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,7 @@
     _kernel = std::move(k);
 
     // Configure border handler
-    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
 
     _shift_handler.configure(input);
 
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
index c0cf098..cb14b8a 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,7 +68,7 @@
         return;
     }
 
-    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
 
     _shift_handler.configure(input);
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
index b2e69ee..2569365 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,7 +48,7 @@
     _norm_kernel.configure(input, &_squared_input, output, norm_info);
     _multiply_kernel.configure(input, input, &_squared_input, 1.0f);
     // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
-    _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
 
     // Allocate intermediate buffers
     _squared_input.allocator()->allocate();
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
index 7d928d6..97c20d1 100644
--- a/src/runtime/ISimpleLifetimeManager.cpp
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -59,7 +59,7 @@
     // Check if there is a free blob
     if(_free_blobs.empty())
     {
-        _occupied_blobs.emplace_front(Blob{ obj, 0, { obj } });
+        _occupied_blobs.emplace_front(Blob{ obj, 0, 0, { obj } });
     }
     else
     {
@@ -71,7 +71,7 @@
     _active_elements.insert(std::make_pair(obj, obj));
 }
 
-void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t size)
+void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t size, size_t alignment)
 {
     ARM_COMPUTE_ERROR_ON(obj == nullptr);
 
@@ -80,10 +80,11 @@
     ARM_COMPUTE_ERROR_ON(active_object_it == std::end(_active_elements));
 
     // Update object fields and mark object as complete
-    Element &el = active_object_it->second;
-    el.handle   = &obj_memory;
-    el.size     = size;
-    el.status   = true;
+    Element &el  = active_object_it->second;
+    el.handle    = &obj_memory;
+    el.size      = size;
+    el.alignment = alignment;
+    el.status    = true;
 
     // Find object in the occupied lists
     auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b)
@@ -94,8 +95,9 @@
 
     // Update occupied blob and return as free
     occupied_blob_it->bound_elements.insert(obj);
-    occupied_blob_it->max_size = std::max(occupied_blob_it->max_size, size);
-    occupied_blob_it->id       = nullptr;
+    occupied_blob_it->max_size      = std::max(occupied_blob_it->max_size, size);
+    occupied_blob_it->max_alignment = std::max(occupied_blob_it->max_alignment, alignment);
+    occupied_blob_it->id            = nullptr;
     _free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it);
 
     // Check if all object are finalized and reset active group
diff --git a/src/runtime/MEMUtils.cpp b/src/runtime/MEMUtils.cpp
index ad00070..5ae1c2a 100644
--- a/src/runtime/MEMUtils.cpp
+++ b/src/runtime/MEMUtils.cpp
@@ -27,7 +27,7 @@
 
 #ifndef BARE_METAL
 #include <fstream>
-#include <regex>
+#include <iterator>
 #include <sstream>
 #endif // ifndef BARE_METAL
 
@@ -43,41 +43,33 @@
     size_t        memfree  = 0;
     std::ifstream meminfo_f;
     meminfo_f.open("/proc/meminfo", std::ios::in);
+
     if(meminfo_f.is_open())
     {
-        std::stringstream str_stream;
-        str_stream << meminfo_f.rdbuf();
-        const std::string str = str_stream.str();
-        try
+        std::string line;
+        while(bool(getline(meminfo_f, line)))
         {
-            std::smatch match;
-            if(std::regex_search(str, match, std::regex("MemTotal: (.*)kB")) && match.size() > 1)
+            std::istringstream       iss(line);
+            std::vector<std::string> tokens((std::istream_iterator<std::string>(iss)),
+                                            std::istream_iterator<std::string>());
+            if(tokens[0] == "MemTotal:")
             {
-                const std::string result = match.str(1);
-                total                    = std::stoul(result, nullptr, 0);
+                total = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
             }
-            if(std::regex_search(str, match, std::regex("MemFree: (.*)kB")) && match.size() > 1)
+            else if(tokens[0] == "MemFree:")
             {
-                const std::string result = match.str(1);
-                memfree                  = std::stoul(result, nullptr, 0);
+                memfree = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
             }
-            if(std::regex_search(str, match, std::regex("Buffers: (.*)kB")) && match.size() > 1)
+            else if(tokens[0] == "Buffers:")
             {
-                const std::string result = match.str(1);
-                buffer                   = std::stoul(result, nullptr, 0);
+                buffer = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
             }
-            if(std::regex_search(str, match, std::regex("Cached: (.*)kB")) && match.size() > 1)
+            else if(tokens[0] == "Cached:")
             {
-                const std::string result = match.str(1);
-                memcache                 = std::stoul(result, nullptr, 0);
+                memcache = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
             }
-            free = memfree + (buffer + memcache);
         }
-        catch(std::regex_error &e)
-        {
-            // failed parsing /proc/meminfo
-            // return 0s on all fields
-        }
+        free = memfree + (buffer + memcache);
     }
 #endif // ifndef BARE_METAL
 }
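
For illustration, a standalone sketch of the whitespace tokenisation that replaces the regex-based parsing of /proc/meminfo: split each line into tokens and key off the first one. The sample line is an assumption.

    #include <iostream>
    #include <iterator>
    #include <sstream>
    #include <string>
    #include <vector>

    int main()
    {
        const std::string line = "MemTotal:        3986360 kB";
        std::istringstream iss(line);
        const std::vector<std::string> tokens((std::istream_iterator<std::string>(iss)),
                                              std::istream_iterator<std::string>());
        if(!tokens.empty() && tokens[0] == "MemTotal:")
        {
            std::cout << "total (kB): " << tokens[1] << std::endl; // prints 3986360
        }
        return 0;
    }
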
diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
new file mode 100644
index 0000000..1287204
--- /dev/null
+++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+INESimpleFunctionNoBorder::INESimpleFunctionNoBorder() // NOLINT
+    : _kernel()
+{
+}
+
+void INESimpleFunctionNoBorder::run()
+{
+    NEScheduler::get().schedule(_kernel.get(), Window::DimY);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
new file mode 100644
index 0000000..d33e134
--- /dev/null
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduction_kernel(), _fill_border_kernel(), _run_fill_border(false)
+{
+}
+void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op)
+{
+    _reduction_kernel.configure(input, output, axis, op);
+
+    if(axis == 0)
+    {
+        _fill_border_kernel.configure(input, _reduction_kernel.border_size(), BorderMode::REPLICATE);
+        _run_fill_border = true;
+    }
+}
+
+Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op));
+    return Status{};
+}
+
+void NEArgMinMaxLayer::run()
+{
+    _memory_group.acquire();
+
+    if(_run_fill_border)
+    {
+        NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+    }
+    NEScheduler::get().schedule(&_reduction_kernel, Window::DimY);
+
+    _memory_group.release();
+}
+
+} // namespace arm_compute
\ No newline at end of file
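
A hypothetical usage sketch of the new NEArgMinMaxLayer, not part of the patch; the shapes, the axis and the U32 output type are assumptions to be checked against NEArgMinMaxLayer::validate().

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void argmax_example()
    {
        Tensor input, output;
        input.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::U32)); // one index per reduced row (assumed shape)

        NEArgMinMaxLayer argmax(nullptr); // no memory manager
        argmax.configure(&input, 0 /* axis */, &output, ReductionOperation::ARG_IDX_MAX);

        input.allocator()->allocate();
        output.allocator()->allocate();
        // ... fill input, then:
        argmax.run();
    }
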
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 677e9f6..b155077 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,16 +36,6 @@
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>();
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
 }
 Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
 {
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 931e5db..5059162 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -148,8 +148,7 @@
         return (*found).second;
     }
 
-    if(dilation != Size2D(1U, 1U) || Scheduler::get().cpu_info().get_cpu_model() == CPUModel::A53
-       || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
+    if(dilation != Size2D(1U, 1U) || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
     {
         return ConvolutionMethod::GEMM;
     }
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 6887a0a..44d7197 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -145,6 +145,15 @@
     _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
     _scaled_output.allocator()->allocate();
 }
+Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info)
+{
+    return NEDeconvolutionLayer::validate(input, weights, bias, output, info, 0, 0);
+}
+
+void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info)
+{
+    configure(input, weights, bias, output, info, 0, 0);
+}
 
 void NEDeconvolutionLayer::run()
 {
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index a2f0094..f0fd4cf 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -72,7 +72,7 @@
             accum_layout = DataLayout::NCHW;
         }
 
-        _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, input->info()->quantization_info()));
+        _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
         _accumulator.info()->set_data_layout(accum_layout);
         zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
     }
@@ -271,7 +271,7 @@
     const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
     ARM_COMPUTE_UNUSED(channel_idx);
 
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_ERROR_ON((input->info()->dimension(channel_idx) * depth_multiplier) != weights->info()->dimension(channel_idx));
 
diff --git a/src/runtime/NEON/functions/NEElementwiseOperators.cpp b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
new file mode 100644
index 0000000..74c1957
--- /dev/null
+++ b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
+#include <arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h>
+
+#include "arm_compute/core/ITensor.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::MAX, input1, input2, output);
+    _kernel = std::move(k);
+}
+
+Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return NEArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
+}
+
+void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::MIN, input1, input2, output);
+    _kernel = std::move(k);
+}
+
+Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return NEArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
+}
+
+void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+    _kernel = std::move(k);
+}
+
+Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return NEArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+}
+
+void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEDivisionOperationKernel>();
+    k->configure(input1, input2, output);
+    _kernel = std::move(k);
+}
+
+Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return NEDivisionOperationKernel::validate(input1, input2, output);
+}
+
+template <ComparisonOperation COP>
+void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
+    k->configure(COP, input1, input2, output);
+    _kernel = std::move(k);
+}
+
+template <ComparisonOperation COP>
+Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return NEComparisonOperationKernel::validate(COP, input1, input2, output);
+}
+
+void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
+    k->configure(op, input1, input2, output);
+    _kernel = std::move(k);
+}
+
+Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op)
+{
+    return NEComparisonOperationKernel::validate(op, input1, input2, output);
+}
+
+// Supported Specializations
+template class NEElementwiseComparisonStatic<ComparisonOperation::Equal>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Greater>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Less>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace arm_compute
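
A hypothetical usage sketch for the new element-wise functions (shapes and data type are illustrative); they follow the usual configure-then-run pattern of the simple NEON functions.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void elementwise_max_example()
    {
        Tensor a, b, out;
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        NEElementwiseMax max_op;
        max_op.configure(&a, &b, &out);

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();
        // ... fill a and b, then:
        max_op.run();
    }
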
diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
new file mode 100644
index 0000000..10142c7
--- /dev/null
+++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NERsqrtLayer::configure(const ITensor *input, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
+    k->configure(ElementWiseUnary::RSQRT, input, output);
+    _kernel = std::move(k);
+}
+Status NERsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::RSQRT, input, output);
+}
+
+void NEExpLayer::configure(const ITensor *input, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
+    k->configure(ElementWiseUnary::EXP, input, output);
+    _kernel = std::move(k);
+}
+Status NEExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::EXP, input, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
new file mode 100644
index 0000000..dc48731
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEFuseBatchNormalization::NEFuseBatchNormalization()
+    : _fuse_bn_kernel()
+{
+}
+
+void NEFuseBatchNormalization::configure(const ITensor *conv_weights, const ITensor *bn_mean, const ITensor *bn_var,
+                                         ITensor *fused_weights, ITensor *fused_bias,
+                                         const ITensor *conv_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
+                                         float epsilon)
+{
+    _fuse_bn_kernel.configure(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+}
+
+Status NEFuseBatchNormalization::validate(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+                                          const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+                                          const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+                                          float epsilon)
+{
+    return NEFuseBatchNormalizationKernel::validate(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+}
+
+void NEFuseBatchNormalization::run()
+{
+    NEScheduler::get().schedule(&_fuse_bn_kernel, Window::DimY);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 72a3e80..914f088 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -91,8 +91,8 @@
             shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
             shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
 
-            TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
-            TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+            TensorInfo info_a = a->info()->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true);
+            TensorInfo info_b = b->info()->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true);
 
             _tmp_a.allocator()->init(info_a);
             _tmp_b.allocator()->init(info_b);
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 922f757..470e922 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,6 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 
 #include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
 #include "arm_compute/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
@@ -38,14 +35,14 @@
 {
 namespace
 {
-std::unique_ptr<IFunction> create_function_all_types(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+std::unique_ptr<IFunction> create_function_all_types(arm_gemm::KernelDescription gemm_kernel_info,
+                                                     const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
                                                      std::shared_ptr<IMemoryManager> memory_manager)
 
 {
     //Note: It's safe to not check for FP16 support because this was already checked in NEGEMMAssemblyDispatch::configure()
-    switch(method)
+    switch(gemm_kernel_info.method)
     {
-        case arm_gemm::GemmMethod::GEMM_INTERLEAVED_FP16:
         case arm_gemm::GemmMethod::GEMM_INTERLEAVED:
         {
             if(!pretranspose_hint)
@@ -56,99 +53,41 @@
             function->configure(a, b, d, alpha, beta, pretranspose_hint);
             return std::move(function);
         }
-        default:
-            return nullptr;
-    }
-}
-
-template <typename TypeInput, typename TypeOutput>
-std::unique_ptr<IFunction> create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
-                                           std::shared_ptr<IMemoryManager> memory_manager)
-{
-    ARM_COMPUTE_UNUSED(method);
-    ARM_COMPUTE_UNUSED(a);
-    ARM_COMPUTE_UNUSED(b);
-    ARM_COMPUTE_UNUSED(d);
-    ARM_COMPUTE_UNUSED(alpha);
-    ARM_COMPUTE_UNUSED(beta);
-    ARM_COMPUTE_UNUSED(pretranspose_hint);
-    ARM_COMPUTE_UNUSED(memory_manager);
-    return nullptr;
-}
-
-#ifdef __aarch64__
-template <>
-std::unique_ptr<IFunction> create_function<int8_t, int32_t>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
-                                                            std::shared_ptr<IMemoryManager> memory_manager)
-{
-    switch(method)
-    {
-        case arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT:
-        {
-            if(!pretranspose_hint)
-            {
-                return nullptr;
-            }
-            auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
-            function->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */);
-            return std::move(function);
-        }
-        default:
-            return nullptr;
-    }
-    return nullptr;
-}
-
-template <>
-std::unique_ptr<IFunction> create_function<uint8_t, uint32_t>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
-                                                              std::shared_ptr<IMemoryManager> memory_manager)
-{
-    switch(method)
-    {
-        case arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT:
-        {
-            if(!pretranspose_hint)
-            {
-                return nullptr;
-            }
-            auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
-            function->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */);
-            return std::move(function);
-        }
-        default:
-            return nullptr;
-    }
-    return nullptr;
-}
-
-template <>
-std::unique_ptr<IFunction> create_function<float, float>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
-                                                         std::shared_ptr<IMemoryManager> memory_manager)
-{
-    ARM_COMPUTE_UNUSED(pretranspose_hint);
-    ARM_COMPUTE_UNUSED(memory_manager);
-    switch(method)
-    {
+#if defined(__aarch64__)
         case arm_gemm::GemmMethod::GEMM_NATIVE:
         {
-            auto kernel = support::cpp14::make_unique<NEGEMMNativeWrapperKernel<float, float>>();
-            kernel->configure(a, b, d, alpha, beta);
-            auto function = support::cpp14::make_unique<NESimpleAssemblyFunction>();
-            function->configure(std::move(kernel));
-            return std::move(function);
+            if(gemm_kernel_info.name.find("sgemm_native_16x4") != std::string::npos)
+            {
+                auto kernel = support::cpp14::make_unique<NEGEMMNativeWrapperKernel<float, float>>();
+                kernel->configure(a, b, d, alpha, beta);
+                auto function = support::cpp14::make_unique<NESimpleAssemblyFunction>();
+                function->configure(std::move(kernel));
+                return std::move(function);
+            }
+            return nullptr;
         }
+#endif // defined(__aarch64__)
         default:
             return nullptr;
     }
 }
-#endif /* __aarch64__ */
 
 /** Fallback in case ACL doesn't have a function */
 template <typename TypeInput, typename TypeOutput>
 class Fallback : public NEGEMMAssemblyDispatch::IFallback
 {
 public:
-    void configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> &args, MemoryGroup &memory_group);
+    /** Initialise the function's input and output.
+     *
+     * @param[in]  a            Input tensor containing the Matrix A.
+     * @param[in]  b            Input tensor containing the Matrix B.
+     * @param[out] d            Output tensor to store the result of matrix multiplication.
+     * @param[in]  args         Matrix multiplication information.
+     * @param[in]  memory_group Memory group to be used by the function.
+     */
+    void configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> args, MemoryGroup &memory_group);
+
+    // Inherited methods overridden:
     void run() override;
     void prepare() override;
     bool is_configured() const override;
@@ -187,9 +126,16 @@
 };
 
 template <typename TypeInput, typename TypeOutput>
-void Fallback<TypeInput, TypeOutput>::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> &args, MemoryGroup &memory_group)
+void Fallback<TypeInput, TypeOutput>::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> args, MemoryGroup &memory_group)
 {
-    _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput>(args, nullptr);
+    arm_gemm::GemmConfig              gemm_cfg;
+    const arm_gemm::KernelDescription gemm_kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args);
+    if(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
+    {
+        gemm_cfg.filter = gemm_kernel_info.name;
+        args._cfg       = &gemm_cfg;
+    }
+    _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput>(args);
     if(_gemm_kernel_asm == nullptr)
     {
         //configuration not supported: Leave function unconfigured:
@@ -199,7 +145,7 @@
     // arm_compute wrapper for the Gemm object (see above)
     std::unique_ptr<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>> acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>>();
     ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
-    acl_gemm_wrapper->configure(_gemm_kernel_asm.get());
+    acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
     const size_t workspace_size = _gemm_kernel_asm->get_working_size();
     if(workspace_size > 0)
     {
@@ -229,8 +175,6 @@
         const unsigned int alignment           = 128;
         const size_t       B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
         _pretranspose.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
-        _pretranspose.allocator()->allocate();
-        ARM_COMPUTE_ERROR_ON_NULLPTR(_pretranspose.buffer());
     }
 }
 
@@ -242,6 +186,7 @@
         // Pretranspose B if required
         if(_gemm_kernel_asm->B_pretranspose_required())
         {
+            _pretranspose.allocator()->allocate();
             ARM_COMPUTE_ERROR_ON(_pretranspose.buffer() == nullptr);
             const int  ldb            = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
             const auto in1_ptr        = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
@@ -335,12 +280,8 @@
     arm_gemm::GemmArgs<TypeOutput> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
 
     //Try to create an ACL function:
-    acl_function = create_function_all_types(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager);
-    // If the type agnostic factory failed to create an ACL function, try the specialised one:
-    if(acl_function == nullptr)
-    {
-        acl_function = create_function<TypeInput, TypeOutput>(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager);
-    }
+    acl_function = create_function_all_types(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, std::move(memory_manager));
+
     //If we still don't have an ACL function:
     if(acl_function == nullptr)
     {
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 0232a83..be7cc2d 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -90,7 +90,7 @@
 }
 
 NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
-    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
+    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
       _add_bias_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false),
       _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 4b02694..5286f11 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -97,9 +97,9 @@
         else
         {
             // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-            TensorInfo info_a(compute_interleaved_shape(*a->info()), 1, a->info()->data_type());
+            TensorInfo info_a = a->info()->clone()->set_tensor_shape(compute_interleaved_shape(*a->info())).set_is_resizable(true);
             // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-            TensorInfo info_b(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type());
+            TensorInfo info_b = b->info()->clone()->set_tensor_shape(compute_transpose1xW_shape(*b->info())).set_is_resizable(true);
             _tmp_a.allocator()->init(info_a);
             _tmp_b.allocator()->init(info_b);
             _memory_group.manage(&_tmp_a);
@@ -241,8 +241,8 @@
             shape_tmp_b.set(0, b->dimension(1) * 16);
             shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
 
-            TensorInfo info_a(shape_tmp_a, 1, a->data_type());
-            TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+            TensorInfo info_a = a->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true);
+            TensorInfo info_b = b->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true);
 
             ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
             ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp
new file mode 100644
index 0000000..078bd5a
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGather.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGather.h"
+
+#include "arm_compute/core/NEON/kernels/NEGatherKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NEGather::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEGatherKernel>();
+    k->configure(input, indices, output, axis);
+    _kernel = std::move(k);
+}
+
+Status NEGather::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+    return NEGatherKernel::validate(input, indices, output, axis);
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
index fa8aaeb..8645b43 100644
--- a/src/runtime/NEON/functions/NEIntegralImage.cpp
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,5 +36,5 @@
     auto k = arm_compute::support::cpp14::make_unique<NEIntegralImageKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
-    _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, static_cast<float>(0.f));
+    _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
 }
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index d0b80fb..56da966 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -26,8 +26,8 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
@@ -57,8 +57,8 @@
 
     ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
 
-    // Reduce shape on axis (supported axis is 0)
-    shape.set(0, 1);
+    // Reduce shape on axis
+    shape.set(axis, 1);
     sum_sq.set_tensor_shape(shape);
 
     ARM_COMPUTE_RETURN_ON_ERROR(NEL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
@@ -75,3 +75,4 @@
 
     _memory_group.release();
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index 7c7580a..9e7a713 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -111,8 +111,8 @@
     _forget_gate_out2.allocator()->allocate();
     _memory_group.manage(&_forget_gate_out5);
     _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+    _forget_gate_out1.allocator()->allocate();
     Tensor *forget_gate_out = &_forget_gate_out5;
-
     if(lstm_params.has_peephole_opt())
     {
         _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
@@ -129,18 +129,18 @@
     {
         _forget_gate_out3.allocator()->allocate();
     }
-    _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
-    forget_gate_out->allocator()->allocate();
+    _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
 
     // Configure block that calculates the input gate
     // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     // input_gate = 1 - forget_gate, with CIFG
     _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    Tensor *input_gate_out = &_input_gate_out1;
     if(lstm_params.has_cifg_opt())
     {
         _memory_group.manage(&_input_gate_out1);
         _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-        _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+        _subtract_input_gate.configure(&_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
         _ones.allocator()->allocate();
         _run_cifg_opt = true;
     }
@@ -162,16 +162,22 @@
         _input_gate_out2.allocator()->allocate();
         _memory_group.manage(&_input_gate_out4);
         _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+        _input_gate_out3.allocator()->allocate();
+        input_gate_out = &_input_gate_out4;
         if(_run_peephole_opt)
         {
             _memory_group.manage(&_input_gate_out5);
             _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
             _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _input_gate_out4.allocator()->allocate();
             _input_gate_out5.allocator()->allocate();
+            input_gate_out = &_input_gate_out1;
         }
-        _input_gate_out3.allocator()->allocate();
-        _input_gate_out4.allocator()->allocate();
-        _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        else
+        {
+            _input_gate_out1.allocator()->allocate();
+        }
+        _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     }
 
     // Configure block that calculates the cell state
@@ -194,11 +200,9 @@
     _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
     _activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
     _memory_group.manage(&_cell_state_out5);
-    _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-    _input_gate_out1.allocator()->allocate();
+    _pixelwise_mul_cell_state1.configure(&_cell_state_out4, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _cell_state_out4.allocator()->allocate();
-    _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-    _forget_gate_out1.allocator()->allocate();
+    _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
     _cell_state_out3.allocator()->allocate();
     _cell_state_out5.allocator()->allocate();
@@ -246,7 +250,6 @@
         _output1.allocator()->allocate();
     }
     _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
-    output_gate_out->allocator()->allocate();
 
     // Configure block that calculates the output state
     /** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -265,6 +268,7 @@
     _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
     _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _cell_state_activation.allocator()->allocate();
+    output_gate_out->allocator()->allocate();
 
     if(lstm_params.has_projection())
     {
@@ -281,19 +285,22 @@
 
     // Copy cell state and output
     _copy_cell_state.configure(&_cell_state_out1, cell_state_out);
-    _cell_state_out1.allocator()->allocate();
     _copy_output.configure(output_state_out, output);
 
     // Vector for holding the tensors to store in scratch buffer
     std::vector<ITensor *> scratch_inputs;
     if(!lstm_params.has_cifg_opt())
     {
-        scratch_inputs.emplace_back(&_input_gate_out1);
+        scratch_inputs.emplace_back(input_gate_out);
     }
     scratch_inputs.emplace_back(&_cell_state_out1);
     scratch_inputs.emplace_back(forget_gate_out);
     scratch_inputs.emplace_back(output_gate_out);
     _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+    input_gate_out->allocator()->allocate();
+    _cell_state_out1.allocator()->allocate();
+    forget_gate_out->allocator()->allocate();
+    output_gate_out->allocator()->allocate();
 }
 
 Status NELSTMLayer::validate(const ITensorInfo *input,
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
new file mode 100644
index 0000000..f5c2718
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+TensorInfo get_expected_output_tensorinfo(const ITensorInfo &input, const PaddingList &paddings)
+{
+    const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input.tensor_shape(), paddings);
+    const TensorInfo  expected_output_info  = input.clone()->set_tensor_shape(expected_output_shape);
+    return expected_output_info;
+}
+
+Status validate_arguments(const ITensorInfo &input, ITensorInfo &output, const PaddingList &paddings)
+{
+    const TensorInfo expected_output_info = get_expected_output_tensorinfo(input, paddings);
+    auto_init_if_empty(output, expected_output_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output, &expected_output_info);
+
+    return Status{};
+}
+
+Coordinates get_subtensor_coords(const PaddingList &paddings)
+{
+    Coordinates coords;
+    for(unsigned int i = 0; i < paddings.size(); ++i)
+    {
+        coords.set(i, paddings[i].first);
+    }
+
+    return coords;
+}
+} // namespace
+
+NEPadLayer::NEPadLayer()
+    : _memset_kernel(), _copy_kernel(), _output_subtensor()
+{
+}
+
+void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, PixelValue constant_value)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_THROW_ON_ERROR(NEPadLayer::validate(input->info(), output->info(), padding, constant_value));
+
+    // Auto-init
+    auto_init_if_empty(*output->info(), get_expected_output_tensorinfo(*input->info(), padding));
+
+    // Create SubTensor (Can use sub-tensor as the kernels to be executed do not require padding)
+    _output_subtensor = SubTensor(output, input->info()->tensor_shape(), get_subtensor_coords(padding), true);
+
+    // Fill the whole output with the specified constant value
+    _memset_kernel.configure(output, constant_value);
+
+    // Copy the input to the output
+    _copy_kernel.configure(input, &_output_subtensor);
+}
+
+Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
+{
+    ARM_COMPUTE_UNUSED(constant_value);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    auto output_clone = output->clone();
+
+    SubTensorInfo output_subtensor_info(output_clone.get(), input->tensor_shape(), get_subtensor_coords(padding), true);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output_clone, padding));
+    ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(input, &output_subtensor_info));
+
+    return Status{};
+}
+
+void NEPadLayer::run()
+{
+    NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+    NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+}
+} // namespace arm_compute
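A usage sketch for the new NEPadLayer (shapes and padding amounts are illustrative; the output tensor is assumed to be auto-initialised during configure(), as the auto_init_if_empty call above suggests):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NEPadLayer.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));

        // Pad one element before and after the first two dimensions -> 6x6 output filled with zeros
        const PaddingList padding = { { 1, 1 }, { 1, 1 } };

        NEPadLayer pad;
        pad.configure(&src, &dst, padding, PixelValue());

        src.allocator()->allocate();
        dst.allocator()->allocate();
        pad.run();
        return 0;
    }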
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
index 1f1400c..3aca4b7 100644
--- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,14 +27,14 @@
 #include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 NEROIPoolingLayer::NEROIPoolingLayer()
     : _roi_kernel()
 {
 }
 
-void NEROIPoolingLayer::configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
 {
     _roi_kernel.configure(input, rois, output, pool_info);
 }
@@ -43,3 +43,4 @@
 {
     NEScheduler::get().schedule(&_roi_kernel, Window::DimX);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp
new file mode 100644
index 0000000..977d502
--- /dev/null
+++ b/src/runtime/NEON/functions/NERange.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NERange.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NERange::NERange()
+    : _kernel()
+{
+}
+
+void NERange::configure(ITensor *output, const float start, const float end, const float step)
+{
+    _kernel.configure(output, start, end, step);
+}
+
+Status NERange::validate(const ITensorInfo *output, const float start, const float end, const float step)
+{
+    return NERangeKernel::validate(output, start, end, step);
+}
+
+void NERange::run()
+{
+    NEScheduler::get().schedule(&_kernel, Window::DimX);
+}
+} // namespace arm_compute
\ No newline at end of file
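A short usage sketch for NERange (the output length must match the number of elements in the range; the shape below is chosen accordingly and is otherwise illustrative):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NERange.h"

    using namespace arm_compute;

    int main()
    {
        Tensor dst;
        dst.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

        NERange range;
        range.configure(&dst, 0.f, 10.f, 1.f); // produces 0, 1, ..., 9

        dst.allocator()->allocate();
        range.run();
        return 0;
    }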
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 0b022df..014895f 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -14,9 +14,9 @@
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INNEUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY NEAIM, DAMAGES OR OTHER
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
@@ -39,17 +39,38 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
-    for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i)
-    {
-        if(output->total_size() > 0)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1);
-            ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(reduction_axis[i]) > input->num_dimensions() - 1);
-        }
+    TensorShape        out_shape     = input->tensor_shape();
+    const unsigned int reduction_ops = reduction_axis.num_dimensions();
+    const int          input_dims    = input->num_dimensions();
+    Coordinates        axis_local    = reduction_axis;
 
-        ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM));
+    // Convert negative axis
+    for(unsigned int i = 0; i < reduction_ops; ++i)
+    {
+        axis_local[i] = wrap_around(axis_local[i], input_dims);
     }
 
+    std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+    for(unsigned int i = 0; i < reduction_ops; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+        ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
+        if(output->total_size() > 0 && keep_dims)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+        }
+        if(keep_dims)
+        {
+            out_shape.set(axis_local[i], 1);
+        }
+        else
+        {
+            out_shape.remove_dimension(axis_local[i] - i);
+        }
+    }
+    const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
     return Status{};
 }
 
@@ -62,22 +83,32 @@
     _reduced_outs      = arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
     _keep_dims         = keep_dims;
 
+    Coordinates        axis_local    = reduction_axis;
+    const int          input_dims    = input->info()->num_dimensions();
+    const unsigned int reduction_ops = reduction_axis.num_dimensions();
+
+    // Convert negative axis
+    for(unsigned int i = 0; i < reduction_ops; ++i)
+    {
+        axis_local[i] = wrap_around(axis_local[i], input_dims);
+    }
+
     // Perform reduction for every axis
     for(unsigned int i = 0; i < _reduction_ops; ++i)
     {
         TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
-        out_shape.set(reduction_axis[i], 1);
+        out_shape.set(axis_local[i], 1);
         auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
 
         if(i == _reduction_ops - 1 && keep_dims)
         {
-            _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
         }
         else
         {
-            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type()));
+            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
             _memory_group.manage(_reduced_outs.get() + i);
-            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
         }
     }
 
@@ -91,9 +122,13 @@
     if(!keep_dims)
     {
         TensorShape out_shape = input->info()->tensor_shape();
+
+        // We have to sort the reduction axis vectors in order for remove_dimension
+        // to work properly
+        std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
         for(unsigned int i = 0; i < _reduction_ops; ++i)
         {
-            out_shape.remove_dimension(reduction_axis[i]);
+            out_shape.remove_dimension(axis_local[i] - i);
         }
         auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
         _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
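The sorted-axis handling above (wrap_around followed by remove_dimension(axis_local[i] - i)) is easiest to see with a small standalone example; the plain C++ sketch below mimics only the shape bookkeeping and uses made-up dimension values:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    int main()
    {
        std::vector<int> shape = { 8, 8, 16, 2 }; // e.g. [W, H, C, N]
        std::vector<int> axes  = { -1, 1 };       // negative axes are allowed

        for(auto &a : axes)
        {
            a = (a < 0) ? a + static_cast<int>(shape.size()) : a; // wrap_around -> {3, 1}
        }
        std::sort(axes.begin(), axes.end()); // {1, 3}

        for(std::size_t i = 0; i < axes.size(); ++i)
        {
            // Every earlier removal shifts the remaining axes left by one, hence the "- i"
            shape.erase(shape.begin() + (axes[i] - static_cast<int>(i)));
        }
        // shape is now {8, 16}
        return 0;
    }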
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 188c2bb..9f81a40 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,16 +50,6 @@
             ARM_COMPUTE_ERROR("Unsupported reduction axis");
     }
 }
-BorderMode reduction_operation_border_mode(ReductionOperation op)
-{
-    switch(op)
-    {
-        case ReductionOperation::SUM_SQUARE:
-            return BorderMode::CONSTANT;
-        default:
-            return BorderMode::CONSTANT;
-    }
-}
 } // namespace
 
 NEReductionOperation::NEReductionOperation()
@@ -86,9 +76,9 @@
     if(axis == 0)
     {
         // Configure fill border kernel
-        BorderSize fill_border_size = (axis == 0) ? _reduction_kernel.border_size() : BorderSize();
-        BorderMode fill_border_mode = reduction_operation_border_mode(op);
-        _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(static_cast<float>(0.f)));
+        const BorderSize fill_border_size = _reduction_kernel.border_size();
+        const PixelValue pixelValue       = (op == ReductionOperation::PROD) ? PixelValue(1, input->info()->data_type(), input->info()->quantization_info()) : PixelValue(0, input->info()->data_type());
+        _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue);
     }
 }
 
diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp
new file mode 100644
index 0000000..139bd50
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReverse.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReverse.h"
+
+#include "arm_compute/core/NEON/kernels/NEReverseKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEReverseKernel>();
+    k->configure(input, output, axis);
+    _kernel = std::move(k);
+}
+
+Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+    return NEReverseKernel::validate(input, output, axis);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index a9c85bd..483aa4c 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,6 +46,11 @@
 {
     ARM_COMPUTE_ERROR_ON(nullptr == offsets);
     ARM_COMPUTE_UNUSED(sampling_policy);
+    float sampling_offset = 0.0f;
+    if(sampling_policy == SamplingPolicy::CENTER)
+    {
+        sampling_offset = 0.5f;
+    }
 
     Window win;
     win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
@@ -60,8 +65,8 @@
 
         execute_window_loop(win, [&](const Coordinates & id)
         {
-            const float in_x  = (id.x() + 0.5f) * wr - 0.5f;
-            const float in_y  = (id.y() + 0.5f) * hr - 0.5f;
+            const float in_x  = (id.x() + sampling_offset) * wr - sampling_offset;
+            const float in_y  = (id.y() + sampling_offset) * hr - sampling_offset;
             const int   in_xi = std::floor(in_x);
             const int   in_yi = std::floor(in_y);
 
@@ -167,14 +172,14 @@
             ARM_COMPUTE_ERROR("Unsupported interpolation mode");
     }
 
-    _border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
 }
 
 Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
                          BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
     ARM_COMPUTE_UNUSED(border_mode, constant_border_value);
 
     ITensorInfo *offsets = nullptr;
diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp
new file mode 100644
index 0000000..509bbaa
--- /dev/null
+++ b/src/runtime/NEON/functions/NESelect.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESelect.h"
+
+#include "arm_compute/core/NEON/kernels/NESelectKernel.h"
+#include "arm_compute/core/Types.h"
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+void NESelect::configure(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NESelectKernel>();
+    k->configure(c, x, y, output);
+    _kernel = std::move(k);
+}
+
+Status NESelect::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+    return NESelectKernel::validate(c, x, y, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp
new file mode 100644
index 0000000..03c2053
--- /dev/null
+++ b/src/runtime/NEON/functions/NESlice.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESlice.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NESlice::configure(const ITensor *input, ITensor *output, const Coordinates &starts, const Coordinates &ends)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+    // Get absolute end coordinates
+    const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
+
+    auto k = arm_compute::support::cpp14::make_unique<NEStridedSliceKernel>();
+    k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
+    _kernel = std::move(k);
+}
+
+Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+
+    // Check start dimensions for being non-negative
+    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
+    {
+        return i < 0;
+    }));
+
+    // Get absolute end coordinates
+    const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
+
+    return NEStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
+}
+} // namespace arm_compute
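A usage sketch for NESlice (shapes are illustrative; a negative end coordinate is assumed to mean "up to the end of that dimension", which is what construct_slice_end_mask above encodes, and end indices are treated as exclusive):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NESlice.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(6U, 4U), 1, DataType::F32));

        NESlice slice;
        // Elements 1..3 of dimension 0, all of dimension 1 -> 3x4 output
        slice.configure(&src, &dst, Coordinates(1, 0), Coordinates(4, -1));

        src.allocator()->allocate();
        dst.allocator()->allocate();
        slice.run();
        return 0;
    }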
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 9be9e68..36b7d47 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,54 +25,155 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "utils/TypePrinter.h"
 
 #include <cfloat>
 
-using namespace arm_compute;
-
-NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _fill_border_kernel(), _max(), _tmp()
+namespace arm_compute
 {
+NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_kernel_ptr(nullptr), _fill_border_kernel(), _reshape_kernel(), _max(), _tmp(), _input_flattened(),
+      _output_flattened(), _needs_flattening(false)
+{
+}
+
+void NESoftmaxLayer::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, size_t axis)
+{
+    // Flatten the input
+    const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
+
+    // Initialize the flat input
+    _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
+
+    // If we need to flatten the input, we can use NEFlattenKernel or NEReshapeKernel
+    // If flattening on the third axis, we use NEFlattenKernel.
+    // In all other cases we have to use NEReshapeKernel
+    if(axis != 3)
+    {
+        auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayerKernel>();
+        reshape_kernel_ptr->configure(input, &_input_flattened);
+        _flat_or_reshape_kernel_ptr = std::move(reshape_kernel_ptr);
+    }
+    else
+    {
+        auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayerKernel>();
+        flatten_kernel_ptr->configure(input, &_input_flattened);
+        _flat_or_reshape_kernel_ptr = std::move(flatten_kernel_ptr);
+    }
+
+    // We need to init the output tensor here. Indeed, the reshape kernel expects
+    // both tensors to be already initialized
+    auto_init_if_empty(*output->info(), *input->info()->clone());
 }
 
 void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta, size_t axis)
 {
+    // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_UNUSED(axis);
+    ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayer::validate(input->info(), output->info(), beta, axis));
 
-    // Configure Kernels
-    _max_kernel.configure(input, &_max);
-    _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE);
-    _softmax_kernel.configure(input, &_max, output, beta, &_tmp);
+    // Flattening is only skipped when the input is 2D and the axis is 1
+    _needs_flattening = axis != 1;
+
+    // If we are dealing with a 4D tensor, we will:
+    // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor
+    // - Execute all the pipeline (reduction + normalization) on the flattened tensor
+    // - Reshape the flattened output into the real output
+    if(_needs_flattening)
+    {
+        // Add to the memory manager _input_flattened
+        _memory_group.manage(&_input_flattened);
+
+        // Configure the flatten/reshape kernel and _input_flattened
+        configure_reshape_input_kernel(input, output, axis);
+    }
+
+    // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case)
+    // or it is the original input case (2D case)
+    ITensor *input_2D = (_needs_flattening ? &_input_flattened : input);
+
+    // Create intermediate tensors shapes
+    const TensorInfo input_info    = input_2D->info()->clone()->reset_padding().set_is_resizable(true);
+    DataType         tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? DataType::F32 : input_2D->info()->data_type();
+    TensorInfo       tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
 
     // Init intermediate tensors
-    _max.allocator()->init(*_max.info());
-    _tmp.allocator()->init(*_tmp.info());
+    TensorShape max_sum_shape = input_2D->info()->tensor_shape();
+    max_sum_shape.set(0, 1);
+    _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape));
+    _tmp.allocator()->init(tensor_info_tmp);
 
     // Manage intermediate buffers
     _memory_group.manage(&_max);
     _memory_group.manage(&_tmp);
 
-    // Allocate intermediate tensors
+    // Configure Kernels
+    _max_kernel.configure(input_2D, &_max);
+    if(_needs_flattening)
+    {
+        // Add to the memory manager _output_flattened
+        _memory_group.manage(&_output_flattened);
+
+        // The normalization kernel stores the result in a flat output tensor
+        _softmax_kernel.configure(input_2D, &_max, &_output_flattened, beta, &_tmp);
+        _input_flattened.allocator()->allocate();
+
+        // Reshape the flat output into the requested (4D) output
+        _reshape_kernel.configure(&_output_flattened, output);
+
+        // Allocate the intermediate flat tensors
+        _output_flattened.allocator()->allocate();
+    }
+    else
+    {
+        // Softmax 2D case
+        _fill_border_kernel.configure(input_2D, _max_kernel.border_size(), BorderMode::REPLICATE);
+        _softmax_kernel.configure(input_2D, &_max, output, beta, &_tmp);
+    }
+
+    // Allocate intermediate buffers
     _max.allocator()->allocate();
     _tmp.allocator()->allocate();
 }
 
 Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis must be 1 for NEON");
-
     // Perform validation step
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
+    ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_RETURN_ERROR_ON(axis < 1 || input->num_dimensions() < axis);
 
-    const TensorShape max_shape           = TensorShape(input->tensor_shape()).set(0, 1);
-    const TensorInfo  tensor_info_max_sum = TensorInfo(*input).set_tensor_shape(max_shape).reset_padding();
-    const TensorInfo  dont_care;
+    // Create intermediate tensor info
+    DataType         tmp_data_type = input->data_type();
+    const TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type).set_is_resizable(true));
+
+    TensorShape max_sum_shape = input->tensor_shape();
+    max_sum_shape.set(0, 1);
+    const TensorInfo tensor_info_max_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(input->quantization_info()).set_is_resizable(true));
+    const TensorInfo dont_care;
+
+    const bool needs_flattening = (axis != 1);
+
+    if(needs_flattening)
+    {
+        const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, axis);
+        TensorInfo        tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
+
+        if(axis != 3)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, &tensor_info_flat));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &tensor_info_flat));
+        }
+    }
 
     ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum));
-    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(input, &tensor_info_max_sum, output, beta, &dont_care));
+    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(&tensor_info_tmp, &tensor_info_max_sum, output, beta, &dont_care));
 
     return Status{};
 }
@@ -81,9 +182,20 @@
 {
     _memory_group.acquire();
 
+    if(_needs_flattening)
+    {
+        NEScheduler::get().schedule(_flat_or_reshape_kernel_ptr.get(), Window::DimY);
+    }
+
     NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
     NEScheduler::get().schedule(&_max_kernel, Window::DimY);
     NEScheduler::get().schedule(&_softmax_kernel, Window::DimY);
 
+    if(_needs_flattening)
+    {
+        NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
+    }
+
     _memory_group.release();
 }
+} // namespace arm_compute
\ No newline at end of file
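A minimal 2D usage sketch for NESoftmaxLayer (axis = 1, so the flattening/reshape path above is not taken); the logits-by-batches shape is illustrative and the output is assumed to be auto-initialised by configure():

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(1000U, 4U), 1, DataType::F32)); // logits x batches

        NESoftmaxLayer softmax;
        softmax.configure(&src, &dst, 1.0f /* beta */, 1 /* axis: plain 2D case */);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        softmax.run();
        return 0;
    }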
diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp
new file mode 100644
index 0000000..e947657
--- /dev/null
+++ b/src/runtime/NEON/functions/NESplit.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESplit.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+NESplit::NESplit()
+    : _outputs_vector(), _slice_functions(), _num_outputs(0)
+{
+}
+
+void NESplit::configure(const ITensor *input, const std::vector<ITensor *> &outputs, unsigned int axis)
+{
+    // Create Slice functions
+    _num_outputs     = outputs.size();
+    _slice_functions = arm_compute::support::cpp14::make_unique<NESlice[]>(_num_outputs);
+
+    // Get output shape
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs);
+
+    // Extract output tensor info
+    std::vector<ITensorInfo *> outputs_info;
+    for(auto &output : outputs)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+        outputs_info.emplace_back(output->info());
+    }
+
+    // Validate
+    ARM_COMPUTE_ERROR_THROW_ON(NESplit::validate(input->info(), outputs_info, axis));
+
+    const size_t axis_split_step = output_shape[axis];
+    unsigned int axis_offset     = 0;
+
+    // Start/End coordinates
+    Coordinates start_coords;
+    Coordinates end_coords;
+    for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+    {
+        end_coords.set(d, -1);
+    }
+
+    for(unsigned int i = 0; i < _num_outputs; i++)
+    {
+        // Update coordinate on axis
+        start_coords.set(axis, axis_offset);
+        end_coords.set(axis, axis_offset + axis_split_step);
+
+        // Configure slice function
+        _slice_functions[i].configure(input, outputs[i], start_coords, end_coords);
+
+        // Set valid region from shape
+        outputs[i]->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
+
+        // Update axis offset
+        axis_offset += axis_split_step;
+    }
+}
+
+Status NESplit::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &outputs, unsigned int axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON(axis >= input->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(outputs.size() < 2);
+
+    // Get output shape
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input, axis, outputs.size());
+    ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0);
+
+    const size_t axis_split_step = output_shape[axis];
+    unsigned int axis_offset     = 0;
+
+    // Start/End coordinates
+    Coordinates start_coords;
+    Coordinates end_coords;
+    for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+    {
+        end_coords.set(d, -1);
+    }
+
+    // Validate output tensors
+    for(const auto &output : outputs)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+        // Output auto-initialization if not yet initialized
+        TensorInfo tmp_output_info = *output->clone();
+        auto_init_if_empty(tmp_output_info, input->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
+
+        // Update coordinate on axis
+        start_coords.set(axis, axis_offset);
+        end_coords.set(axis, axis_offset + axis_split_step);
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(input, output, start_coords, end_coords));
+        axis_offset += axis_split_step;
+    }
+
+    return Status{};
+}
+
+void NESplit::run()
+{
+    for(unsigned i = 0; i < _num_outputs; ++i)
+    {
+        _slice_functions[i].run();
+    }
+}
+} // namespace arm_compute
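A usage sketch for NESplit (shapes are illustrative; the output tensors are assumed to be auto-initialised while the per-output slice functions are configured):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NESplit.h"

    #include <vector>

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst0, dst1;
        src.allocator()->init(TensorInfo(TensorShape(8U, 6U), 1, DataType::F32));
        std::vector<ITensor *> outputs = { &dst0, &dst1 };

        NESplit split;
        split.configure(&src, outputs, 1); // split the 6 rows into two 8x3 chunks

        src.allocator()->allocate();
        dst0.allocator()->allocate();
        dst1.allocator()->allocate();
        split.run();
        return 0;
    }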
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
new file mode 100644
index 0000000..2f49c22
--- /dev/null
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEStackLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "support/ToolchainSupport.h"
+namespace arm_compute
+{
+NEStackLayer::NEStackLayer() // NOLINT
+    : _input(),
+      _stack_kernels(),
+      _num_inputs(0)
+{
+}
+
+void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITensor *output)
+{
+    _num_inputs    = input.size();
+    _stack_kernels = arm_compute::support::cpp14::make_unique<NEStackLayerKernel[]>(_num_inputs);
+
+    // Wrap around negative values
+    const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
+
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output);
+    }
+}
+
+Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input.empty());
+
+    // Wrap around negative values
+    const size_t       rank   = input[0]->num_dimensions();
+    const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1));
+
+    const unsigned int num_inputs = input.size();
+
+    for(unsigned int i = 0; i < num_inputs; i++)
+    {
+        // All the tensors must have the same rank
+        ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
+        // Validate Kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output));
+    }
+
+    return Status{};
+}
+
+void NEStackLayer::run()
+{
+    for(unsigned i = 0; i < _num_inputs; i++)
+    {
+        NEScheduler::get().schedule(&_stack_kernels[i], Window::DimY);
+    }
+}
+} // namespace arm_compute
\ No newline at end of file
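A usage sketch for NEStackLayer (shapes are illustrative; a new dimension of size num_inputs is assumed to be inserted at the chosen axis, and the output is assumed to be auto-initialised by the kernels):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NEStackLayer.h"

    #include <vector>

    using namespace arm_compute;

    int main()
    {
        Tensor in0, in1, dst;
        in0.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));
        in1.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));
        std::vector<ITensor *> inputs = { &in0, &in1 };

        NEStackLayer stack;
        stack.configure(inputs, 0, &dst); // stack the two 4x3 tensors along a new axis 0

        in0.allocator()->allocate();
        in1.allocator()->allocate();
        dst.allocator()->allocate();
        stack.run();
        return 0;
    }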
diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp
new file mode 100644
index 0000000..53eb2b0
--- /dev/null
+++ b/src/runtime/NEON/functions/NEStridedSlice.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
+
+#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEStridedSlice::configure(const ITensor *input, ITensor *output,
+                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEStridedSliceKernel>();
+    k->configure(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    _kernel = std::move(k);
+}
+
+Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+}
+} // namespace arm_compute
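A usage sketch for NEStridedSlice (illustrative; the start/end/stride semantics are assumed to follow the usual TensorFlow-style convention with an exclusive end, and all masks are left at 0):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

        NEStridedSlice strided_slice;
        // Every second element from index 1 up to (not including) index 9: indices 1, 3, 5, 7
        strided_slice.configure(&src, &dst, Coordinates(1), Coordinates(9), BiStrides(2), 0, 0, 0);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        strided_slice.run();
        return 0;
    }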
diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp
new file mode 100644
index 0000000..0ca4413
--- /dev/null
+++ b/src/runtime/NEON/functions/NETile.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NETile.h"
+
+#include "arm_compute/core/NEON/kernels/NETileKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NETile::configure(const ITensor *input, ITensor *output, const Multiples &multiples)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NETileKernel>();
+    k->configure(input, output, multiples);
+    _kernel = std::move(k);
+}
+
+Status NETile::validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+    return NETileKernel::validate(input, output, multiples);
+}
+} // namespace arm_compute
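A usage sketch for NETile (illustrative; Multiples is assumed to hold one repetition factor per dimension, and the output is assumed to be auto-initialised by configure()):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NETile.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(3U, 2U), 1, DataType::F32));

        const Multiples multiples = { 2, 1 }; // repeat twice along dimension 0 -> 6x2 output

        NETile tile;
        tile.configure(&src, &dst, multiples);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        tile.run();
        return 0;
    }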
diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp
new file mode 100644
index 0000000..7532020
--- /dev/null
+++ b/src/runtime/NEON/functions/NEUnstack.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEUnstack.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
+{
+    return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
+}
+
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+{
+    // Sets up the slice coordinates: all start coordinates to 0 and all end coordinates to -1 (open-ended), so the caller only has to update the unstacking axis to slice one 2D tensor at a time.
+    Coordinates slice_end;
+    slice_start.set_num_dimensions(input_num_dimensions);
+    slice_end.set_num_dimensions(input_num_dimensions);
+    for(size_t k = 0; k < input_num_dimensions; ++k)
+    {
+        slice_start.set(k, 0);
+        slice_end.set(k, -1);
+    }
+    slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(slice_end);
+}
+} // namespace
+
+NEUnstack::NEUnstack() // NOLINT
+    : _num_slices(0),
+      _strided_slice_vector()
+{
+}
+
+void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &output_vector, int axis)
+{
+    std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
+    std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+        return t->info();
+    });
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis));
+
+    // Wrap around negative values
+    const unsigned int axis_u = wrap_axis(axis, input->info());
+    _num_slices               = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
+    _strided_slice_vector     = arm_compute::support::cpp14::make_unique<NEStridedSlice[]>(_num_slices);
+
+    Coordinates slice_start;
+    int32_t     slice_end_mask;
+    setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
+    for(unsigned int slice = 0; slice < _num_slices; ++slice)
+    {
+        // Adjust the start coordinate on the unstacking axis to take one 2D slice at a time
+        slice_start.set(axis_u, slice);
+        _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+    }
+}
+
+Status NEUnstack::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &output_vector, int axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_vector.empty());
+    ARM_COMPUTE_RETURN_ERROR_ON(axis < (-static_cast<int>(input->tensor_shape().num_dimensions())));
+    ARM_COMPUTE_RETURN_ERROR_ON(axis >= static_cast<int>(input->tensor_shape().num_dimensions()));
+
+    const unsigned int num_slices = std::min(output_vector.size(), input->dimension(wrap_axis(axis, input)));
+    ARM_COMPUTE_RETURN_ERROR_ON(num_slices > input->dimension(wrap_axis(axis, input)));
+    ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
+
+    Coordinates slice_start;
+    int32_t     slice_end_mask;
+    for(size_t k = 0; k < num_slices; ++k)
+    {
+        slice_start.set(wrap_axis(axis, input), k);
+        setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
+        ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+    }
+    return Status{};
+}
+
+void NEUnstack::run()
+{
+    for(unsigned i = 0; i < _num_slices; ++i)
+    {
+        _strided_slice_vector[i].run();
+    }
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
index 097605c..7e435c3 100644
--- a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,7 +66,7 @@
     _num_inputs = inputs_vector.size();
 
     std::vector<ITensorInfo *> inputs_vector_info;
-    for(unsigned int i = 0; i < _num_inputs; i++)
+    for(unsigned int i = 0; i < _num_inputs; ++i)
     {
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
     }
@@ -80,7 +80,7 @@
 
     _concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEWidthConcatenateLayerKernel[]>(_num_inputs);
 
-    for(unsigned int i = 0; i < _num_inputs; i++)
+    for(unsigned int i = 0; i < _num_inputs; ++i)
     {
         _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
         width_offset += inputs_vector.at(i)->info()->dimension(0);
@@ -89,7 +89,7 @@
 
 void NEWidthConcatenateLayer::run()
 {
-    for(unsigned i = 0; i < _num_inputs; i++)
+    for(unsigned i = 0; i < _num_inputs; ++i)
     {
         NEScheduler::get().schedule(_concat_kernels_vector.get() + i, Window::DimY);
     }
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index c8e3b3b..e37f8ab 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -464,6 +464,7 @@
         transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
 
         //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
+        _memory_group.manage(&_output_nhwc);
         transform_output_kernel->configure(biases, &_output_workspace,
                                            output_matrix_stride, &_output_nhwc,
                                            in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
@@ -483,16 +484,16 @@
                                            in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
     }
 
-    _weights_hwio.allocator()->allocate();
     _gemm_function.configure(&_input_workspace, &_kernel_storage, nullptr, &_output_workspace, 1.0f, 0.f);
     _input_workspace.allocator()->allocate();
-    _kernel_storage.allocator()->allocate();
     _output_workspace.allocator()->allocate();
 
     // Reorder the convoluted output to ACL's ordering NCHW
-    _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
-
-    _output_nhwc.allocator()->allocate();
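+    // Only permute the NHWC result back when the function output uses the NCHW layout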
+    if(data_layout == DataLayout::NCHW)
+    {
+        _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
+        _output_nhwc.allocator()->allocate();
+    }
 
     _transform_input_kernel   = std::move(transform_input_kernel);
     _transform_weights_kernel = std::move(transform_weights_kernel);
@@ -656,10 +657,12 @@
     if(!_is_prepared)
     {
         // Permute weights
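+        // Allocate the permuted weights buffer lazily, now that the permute is about to run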
+        _weights_hwio.allocator()->allocate();
         _permute_weights.run();
         _weights->mark_as_unused();
 
         // Transform weights
+        _kernel_storage.allocator()->allocate();
         NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
 
         _weights_hwio.allocator()->free();
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
index c87e82a..34aaea0 100644
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,18 +26,159 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
+#include "src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h"
+
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+
 namespace arm_compute
 {
+#ifndef NO_MULTI_THREADING
+class BufferManagerMultipleThreads final : public IBufferManager
+{
+public:
+    /** Number of buffers to ping pong between */
+    static constexpr unsigned int NUM_BUFFERS = 3;
+
+    explicit BufferManagerMultipleThreads(unsigned int max_num_users)
+        : _max_num_users(max_num_users)
+    {
+    }
+    unsigned int num_buffers() const override
+    {
+        return NUM_BUFFERS;
+    }
+    /* - Lock the requested index if it's free and return true if it needs reshaping.
+     * - Return false without acquiring the lock if the buffer at the index is already reshaped / being reshaped.
+     * - Block if the corresponding buffer for the given index is still being used by a different index.
+     */
+    bool lock_to_reshape_if_needed(unsigned int index) override
+    {
+        Buffer &buf = get_buffer_from_index(index);
+        while(true)
+        {
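+            // Unsynchronised fast-path check; the result is confirmed again below once the mutex is held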
+            if(buf.index == index && buf.state != State::FREE)
+            {
+                // Another thread is already reshaping / has already reshaped this block: nothing to do
+                return false;
+            }
+            else
+            {
+                std::unique_lock<std::mutex> lock(buf.mutex);
+                //If the buffer is free then lock it for reshaping:
+                if(buf.state == State::FREE)
+                {
+                    buf.index = index;
+                    buf.state = State::BEING_RESHAPED;
+                    return true;
+                }
+                // Check again just in case it changed while we were acquiring the lock:
+                if(buf.index == index)
+                {
+                    //Another thread is reshaping this block already, nothing to do
+                    return false;
+                }
+                // buf.index != index: Buffer still being used by another block, need to wait
+                buf.sem.wait(lock);
+            }
+        }
+    }
+    /* Mark the buffer at the given index as reshaped and release the lock acquired via lock_to_reshape_if_needed() */
+    void mark_as_reshaped(unsigned int index) override
+    {
+        Buffer &buf = get_buffer_from_index(index);
+        {
+            std::lock_guard<std::mutex> lock(buf.mutex);
+            buf.users = _max_num_users;
+            buf.state = State::IN_USE;
+        }
+        buf.sem.notify_all();
+    }
+
+    /* Block until the buffer at the given index is reshaped */
+    void wait_for_reshaping(unsigned int index) override
+    {
+        Buffer &buf = get_buffer_from_index(index);
+        ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
+        // Check if it's already ready to use:
+        if(buf.state == State::IN_USE)
+            return;
+        std::unique_lock<std::mutex> lock(buf.mutex);
+        //Double check it didn't change while we were acquiring the lock:
+        if(buf.state == State::IN_USE)
+            return;
+        buf.sem.wait(lock);
+    }
+    /* Mark the buffer at the given index as not used by this thread anymore.
+     * Once all the threads have called this method then the buffer is marked as free again.
+     */
+    void mark_as_unused(unsigned int index) override
+    {
+        Buffer &buf = get_buffer_from_index(index);
+        ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
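+        // users is atomic, so the decrement needs no lock; only the transition back to FREE happens under the mutex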
+        if(--buf.users == 0)
+        {
+            std::unique_lock<std::mutex> lock(buf.mutex);
+            buf.state = State::FREE;
+            lock.unlock();
+            buf.sem.notify_all();
+        }
+    }
+
+private:
+    enum class State
+    {
+        FREE,
+        BEING_RESHAPED,
+        IN_USE
+    };
+    struct Buffer
+    {
+        unsigned int            index{};
+        std::atomic_uint        users{};
+        State                   state{ State::FREE };
+        std::mutex              mutex{};
+        std::condition_variable sem{};
+    } _buffers[NUM_BUFFERS];
+    Buffer &get_buffer_from_index(unsigned int index)
+    {
+        return _buffers[index % NUM_BUFFERS];
+    }
+    unsigned int _max_num_users;
+};
+#endif /* NO_MULTI_THREADING */
+
+class BufferManagerSingleThread : public IBufferManager
+{
+public:
+    unsigned int num_buffers() const override
+    {
+        return 1;
+    }
+    bool lock_to_reshape_if_needed(unsigned int index) override
+    {
+        return true;
+    }
+    void mark_as_reshaped(unsigned int index) override
+    {
+    }
+    void wait_for_reshaping(unsigned int index) override
+    {
+    }
+    void mark_as_unused(unsigned int index) override
+    {
+    }
+};
+
 NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager))
 {
 }
+
 void NEGEMMInterleavedWrapper::run()
 {
     prepare();
@@ -53,6 +194,7 @@
     {
         if(_pretranspose_b)
         {
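+            // Allocate the pretransposed B buffer only now that prepare() is about to reshape it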
+            _transformed_b.allocator()->allocate();
             NEScheduler::get().schedule(_prepare_b.get(), Window::DimX);
             _b->mark_as_unused();
         }
@@ -65,12 +207,13 @@
 
         //Maximum number of workloads to create:
         const unsigned int num_threads    = NEScheduler::get().num_threads();
-        const unsigned int max_iterations = num_threads == 1 ? 1 : num_threads;
+        const unsigned int max_iterations = std::max(num_threads, _num_windows);
         //Maximum number of iterations the parameters allow:
         const unsigned int num_iterations = _batch_window.num_iterations_total();
         // Keep the smallest of the two:
         const unsigned int num_windows  = std::min(num_iterations, max_iterations);
         const TensorShape  window_shape = _batch_window.shape();
+        const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
 
         // Create a 1D window to dynamically split the batch window:
         Window win_1D;
@@ -79,66 +222,119 @@
         // Create one workload for each sub-window:
         for(unsigned int w = 0; w < num_windows; w++)
         {
-            Window             win          = win_1D.split_window(0, w, num_windows);
-            const Coordinates  start_offset = index2coords(window_shape, win.x().start());
-            const Coordinates  end_offset   = index2coords(window_shape, win.x().end() - 1);
-            const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
+            Window            win          = win_1D.split_window(0, w, num_windows);
+            const Coordinates start_offset = index2coords(window_shape, win.x().start());
+            const Coordinates end_offset   = index2coords(window_shape, win.x().end() - 1);
 
-            auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+            if(_pretranspose_b)
             {
-                //For each block of rows in "M"
-                auto workload_mm = this->_mm_workloads.begin();
-                for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+                auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
                 {
-                    // Transform one k_block from A:
-                    this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
-                    // Then perform the matrix multiplication for each x block along N:
-                    for(unsigned int i = 0; i < num_x_blocks; i++)
+                    //For each block of rows in "M"
+                    auto workload_mm = this->_mm_workloads.begin();
+                    for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
                     {
-                        ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
-                        this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+                        // Transform one k_block from A:
+                        this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+                        // Then perform the matrix multiplication for each x block along N:
+                        for(unsigned int i = 0; i < num_x_blocks; i++)
+                        {
+                            ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
+                            this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+                        }
                     }
-                }
-            };
-            _workloads.push_back(workload);
+                };
+                _workloads.push_back(workload);
+            }
+            else
+            {
+                auto workload = [num_threads, start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+                {
+                    //For each block of rows in "M"
+                    auto         workload_mm = this->_mm_workloads.begin();
+                    unsigned int workload_b  = 0;
+                    // With a single thread, only reshape the B blocks as they are needed; with multiple threads, start one block ahead:
+                    unsigned int workload_b_next = num_threads == 1 ? this->_b_workloads.size() : 1;
+
+                    for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+                    {
+                        // Transform one k_block from A:
+                        this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+                        // Then perform the matrix multiplication for each x block along N:
+                        for(unsigned int i = 0; i < num_x_blocks; i++)
+                        {
+                            ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
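+                            // Speculatively reshape the next B block so its transform overlaps with the multiplies on the current one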
+                            if(workload_b_next < this->_b_workloads.size())
+                            {
+                                // Try to acquire the block through the BufferManager: does this thread need to reshape it?
+                                if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
+                                {
+                                    this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
+                                    this->_buffer_manager->mark_as_reshaped(workload_b_next);
+                                }
+                                workload_b_next++;
+                            }
+                            ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
+                            // Run if needed or wait
+                            if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
+                            {
+                                this->_prepare_b->transform(this->_b_workloads[workload_b], info);
+                                this->_buffer_manager->mark_as_reshaped(workload_b);
+                            }
+                            this->_buffer_manager->wait_for_reshaping(workload_b);
+                            this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+                            this->_buffer_manager->mark_as_unused(workload_b);
+                            workload_b++;
+                        }
+                    }
+                };
+                _workloads.push_back(workload);
+            }
+        }
+        if(!_pretranspose_b && num_windows > 1 && num_windows % num_threads != 0)
+        {
+            // Make sure the number of workloads is a multiple of the number of threads to avoid deadlocks:
+            for(unsigned int leftover = num_windows % num_threads; leftover != num_threads; leftover++)
+            {
+                auto workload = [this](const ThreadInfo & info)
+                {
+                    unsigned int workload_b = 0;
+                    // Start reshaping one B block ahead of the one being consumed:
+                    unsigned int workload_b_next = 1;
+
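+                    // Run the same reshape / wait / release protocol as the real workloads, without performing any multiplication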
+                    for(unsigned int iteration = 0; iteration < this->_mm_workloads.size(); iteration++)
+                    {
+                        if(workload_b_next < this->_b_workloads.size())
+                        {
+                            // Try to acquire the block through the BufferManager: does this thread need to reshape it?
+                            if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
+                            {
+                                this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
+                                this->_buffer_manager->mark_as_reshaped(workload_b_next);
+                            }
+                            workload_b_next++;
+                        }
+                        ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
+                        // Run if needed or wait
+                        if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
+                        {
+                            this->_prepare_b->transform(this->_b_workloads[workload_b], info);
+                            this->_buffer_manager->mark_as_reshaped(workload_b);
+                        }
+                        this->_buffer_manager->wait_for_reshaping(workload_b);
+                        this->_buffer_manager->mark_as_unused(workload_b);
+                        workload_b++;
+                    }
+                };
+                _workloads.push_back(workload);
+            }
         }
 
         _is_prepared = true;
     }
 }
 
-namespace
-{
-// Factory to instantiate NEGEMMInterleavedPrepareBWrapperKernel:
-template <typename InputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor *b, ITensor *transformed_b, const INEGEMMWrapperKernel::Params &params)
-{
-    auto prepare_b = support::cpp14::make_unique<NEGEMMInterleavedPrepareBWrapperKernelTemplate<InputType, use_dot>>();
-    prepare_b->configure(b, transformed_b, false, NEScheduler::get().cpu_info(), params);
-    return std::move(prepare_b);
-}
-
-// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate:
-template <typename InputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor *a, ITensor *transformed_a, const Window &block_walker, const INEGEMMWrapperKernel::Params &params)
-{
-    auto transform_a = support::cpp14::make_unique<NEGEMMInterleavedTransformAWrapperTemplate<InputType, use_dot>>();
-    transform_a->configure(a, transformed_a, false, block_walker, params);
-    return std::move(transform_a);
-}
-
-// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate:
-template <typename InputType, typename OutputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker,
-                                                                                    const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params &params, bool pretranspose_b, float alpha, float beta)
-{
-    auto matrix_multiply = support::cpp14::make_unique<NEGEMMInterleavedMatrixMultiplyWrapperTemplate<InputType, OutputType, use_dot>>();
-    matrix_multiply->configure(transformed_a, transformed_b, tmp_c, c, block_walker, block_sizes, params, pretranspose_b, alpha, beta, NEScheduler::get().num_threads());
-    return std::move(matrix_multiply);
-}
-} // namespace
-
-void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b, bool use_dot)
+void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b)
 {
     _params         = INEGEMMWrapperKernel::extract_parameters(a, b, c);
     _a              = a;
@@ -146,124 +342,80 @@
     _c              = c;
     _pretranspose_b = pretranspose_b;
 
-    DataType input_type = a->info()->data_type();
+    const DataType     input_type  = a->info()->data_type();
+    const CPUInfo     &ci          = NEScheduler::get().cpu_info();
+    const unsigned int num_threads = NEScheduler::get().num_threads();
+
+    const arm_gemm::KernelDescription gemm_kernel_info = get_gemm_info(input_type, ci, num_threads, _params, alpha, beta, pretranspose_b);
+    ARM_COMPUTE_ERROR_ON(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMM_INTERLEAVED);
 
     // Forcing 128-byte alignment (required by 32-bit kernels)
     const unsigned int alignment = 128;
     _transformed_b.allocator()->init(TensorInfo{}, alignment);
     _tmp_c.allocator()->init(TensorInfo{}, alignment);
-    _tag = "NEGEMMInterleaved_";
-    _tag += get_strategy_name(input_type, use_dot);
+    _tag = "NEGEMMInterleaved_" + gemm_kernel_info.name;
+
+    // Get strategy
+    std::unique_ptr<detail::IInterleavedStrategy> strategy = detail::create_strategy(gemm_kernel_info.name);
+    _num_windows                                           = iceildiv(_params.M, strategy->out_height()) * _params.batches;
+    ARM_COMPUTE_ERROR_ON(strategy == nullptr);
 
     if(!_pretranspose_b)
     {
+        _block_sizes = strategy->calculate_block_sizes_for_strategy(ci, _params);
+        _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+        _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
+        // If the execution is single threaded or has only one window then the buffer manager only needs 1 buffer else we will use NUM_BUFFERS buffers and ping pong between them:
+        const unsigned int num_iterations = _batch_window.num_iterations_total();
+        if(NEScheduler::get().num_threads() == 1 || num_iterations == 1)
+        {
+            _buffer_manager = support::cpp14::make_unique<BufferManagerSingleThread>();
+        }
+        else
+        {
+#ifdef NO_MULTI_THREADING
+            ARM_COMPUTE_ERROR("Can't have more than 1 buffer without multiple threads");
+#else  /* NO_MULTI_THREADING */
+            _buffer_manager = support::cpp14::make_unique<BufferManagerMultipleThreads>(NEScheduler::get().num_threads());
+#endif /* NO_MULTI_THREADING */
+        }
         // If B is transposed at every iteration then transformed_B can be managed:
         _memory_group.manage(&_transformed_b);
-        _block_sizes = calculate_block_sizes_from_data_type(NEScheduler::get().cpu_info(), _params.M, _params.N, _params.K, input_type, use_dot);
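+        // transformed_b only needs to hold num_buffers() reshaped B blocks (x_block * k_block elements each) at any time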
+        auto_init_if_empty(*_transformed_b.info(), _b->info()->clone()->set_tensor_shape(TensorShape(_block_sizes.x_block * _block_sizes.k_block, _buffer_manager->num_buffers())));
     }
     else
     {
         _tag += "_preB";
-        switch(input_type)
-        {
-            case DataType::F32:
-                _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
-                break;
-#ifdef __aarch64__
-            case DataType::U8:
-            case DataType::QASYMM8:
-                if(use_dot)
-                {
-                    _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
-                }
-                else
-                {
-                    _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
-                }
-                break;
-            case DataType::S8:
-                if(use_dot)
-                {
-                    _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
-                }
-                else
-                {
-                    _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
-                }
-                break;
-#endif /* __aarch64__ */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            case DataType::F16:
-                _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
-                break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-            default:
-                ARM_COMPUTE_ERROR("DataType not supported");
-                break;
-        }
-        ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+    }
 
+    _prepare_b = strategy->instantiate_prepareB(b, &_transformed_b, _params, ci);
+    ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+
+    if(_pretranspose_b)
+    {
         _block_sizes = _prepare_b->block_sizes();
+        _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+        _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
     }
 
     _block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block));
     _block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block));
     _block_walker.set(Window::DimZ, Window::Dimension(0, _params.multis));
 
-    _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
-    _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
-
     _transformed_a.allocator()->init(TensorInfo(TensorShape{ _block_sizes.k_block, _block_sizes.m_round, _params.batches }, 1, input_type), alignment);
     _memory_group.manage(&_transformed_a);
     _memory_group.manage(&_tmp_c);
 
-    switch(input_type)
-    {
-        case DataType::F32:
-            _transform_a     = instantiate_transformA<float>(_a, &_transformed_a, _block_walker, _params);
-            _matrix_multiply = instantiate_matrix_multiply<float, float>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
-            break;
-#ifdef __aarch64__
-        case DataType::U8:
-        case DataType::QASYMM8:
-            if(use_dot)
-            {
-                _transform_a     = instantiate_transformA<uint8_t, true>(_a, &_transformed_a, _block_walker, _params);
-                _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
-            }
-            else
-            {
-                _transform_a     = instantiate_transformA<uint8_t, false>(_a, &_transformed_a, _block_walker, _params);
-                _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
-            }
-            break;
-        case DataType::S8:
-            if(use_dot)
-            {
-                _transform_a     = instantiate_transformA<int8_t, true>(_a, &_transformed_a, _block_walker, _params);
-                _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
-            }
-            else
-            {
-                _transform_a     = instantiate_transformA<int8_t, false>(_a, &_transformed_a, _block_walker, _params);
-                _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
-            }
-            break;
-#endif /* __aarch64__ */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            _transform_a     = instantiate_transformA<__fp16>(_a, &_transformed_a, _block_walker, _params);
-            _matrix_multiply = instantiate_matrix_multiply<__fp16, __fp16>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
-            break;
-            break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        default:
-            break;
-    }
+    _transform_a     = strategy->instantiate_transformA(_a, &_transformed_a, _block_walker, _params);
+    _matrix_multiply = strategy->instantiate_matrix_multiply(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, alpha, beta, pretranspose_b, num_threads);
     ARM_COMPUTE_ERROR_ON(_transform_a == nullptr);
     ARM_COMPUTE_ERROR_ON(_matrix_multiply == nullptr);
+
     _transformed_a.allocator()->allocate();
     _tmp_c.allocator()->allocate();
-    _transformed_b.allocator()->allocate();
+    if(!_pretranspose_b)
+    {
+        _transformed_b.allocator()->allocate();
+    }
 }
 } // namespace arm_compute
diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp
index d0b3bde..ad23220 100644
--- a/src/runtime/OffsetLifetimeManager.cpp
+++ b/src/runtime/OffsetLifetimeManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,8 +34,16 @@
 #include <map>
 #include <vector>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
+namespace
+{
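+// Rounds offset up to the next multiple of alignment; an alignment of 0 leaves the offset unchanged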
+size_t align_offset(size_t offset, size_t alignment)
+{
+    const size_t remainder = (alignment != 0U) ? offset % alignment : 0U;
+    return (remainder != 0U) ? offset + (alignment - remainder) : offset;
+}
+} // namespace
 OffsetLifetimeManager::OffsetLifetimeManager()
     : _blob(0)
 {
@@ -58,11 +66,15 @@
     ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
 
     // Update blob size
-    size_t max_group_size = std::accumulate(std::begin(_free_blobs), std::end(_free_blobs), static_cast<size_t>(0), [](size_t s, const Blob & b)
+    size_t max_aggregated_size = 0;
+    std::for_each(std::begin(_free_blobs), std::end(_free_blobs), [&](const Blob & b)
     {
-        return s + b.max_size;
+        max_aggregated_size += b.max_size;
+        _blob.alignment = std::max(_blob.alignment, b.max_alignment);
     });
-    _blob = std::max(_blob, max_group_size);
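+    // Reserve worst-case alignment padding: up to one alignment gap per free blob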
+    max_aggregated_size += _free_blobs.size() * _blob.alignment;
+    _blob.owners = std::max(_blob.owners, _free_blobs.size());
+    _blob.size   = std::max(_blob.size, max_aggregated_size);
 
     // Calculate group mappings
     auto &group_mappings = _active_group->mappings();
@@ -76,6 +88,8 @@
             group_mappings[bound_element.handle] = offset;
         }
         offset += free_blob.max_size;
-        ARM_COMPUTE_ERROR_ON(offset > _blob);
+        offset = align_offset(offset, _blob.alignment);
+        ARM_COMPUTE_ERROR_ON(offset > _blob.size);
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp
index 36eaf0b..70cbe90 100644
--- a/src/runtime/OffsetMemoryPool.cpp
+++ b/src/runtime/OffsetMemoryPool.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,11 +34,11 @@
 
 using namespace arm_compute;
 
-OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, size_t blob_size)
-    : _allocator(allocator), _blob(), _blob_size(blob_size)
+OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, BlobInfo blob_info)
+    : _allocator(allocator), _blob(), _blob_info(blob_info)
 {
     ARM_COMPUTE_ERROR_ON(!allocator);
-    _blob = _allocator->make_region(blob_size, 0);
+    _blob = _allocator->make_region(blob_info.size, blob_info.alignment);
 }
 
 void OffsetMemoryPool::acquire(MemoryMappings &handles)
@@ -49,7 +49,7 @@
     for(auto &handle : handles)
     {
         ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
-        handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_size - handle.second));
+        handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_info.size - handle.second));
     }
 }
 
@@ -70,5 +70,5 @@
 std::unique_ptr<IMemoryPool> OffsetMemoryPool::duplicate()
 {
     ARM_COMPUTE_ERROR_ON(!_allocator);
-    return support::cpp14::make_unique<OffsetMemoryPool>(_allocator, _blob_size);
+    return support::cpp14::make_unique<OffsetMemoryPool>(_allocator, _blob_info);
 }
\ No newline at end of file
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 5fa51d7..38edb8b 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -138,7 +138,7 @@
     }
     else
     {
-        _associated_memory_group->finalize_memory(_owner, _memory, info().total_size());
+        _associated_memory_group->finalize_memory(_owner, _memory, info().total_size(), alignment());
     }
     info().set_is_resizable(false);
 }