arm_compute v18.05

commit: b3a371bc429d2ba45e56baaf239d8200c2662a74 [log] [tgz]
author: Jenkins <bsgcomp@arm.com> Wed May 23 11:36:53 2018 +0100
committer: Anthony Barbier <anthony.barbier@arm.com> Wed May 23 14:55:11 2018 +0100
tree: 554525e415c303d64a08722a755397852ebbb8e4
parent: 67c8c91522e5be8156b77f57e63c0253535c902a [diff]
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
index 9a5c13a..84789e7 100644
--- a/src/runtime/CL/CLBufferAllocator.cpp
+++ b/src/runtime/CL/CLBufferAllocator.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,9 +22,11 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/CLBufferAllocator.h"
+#include "arm_compute/runtime/CL/CLMemoryRegion.h"
 
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
 
 #include <cstddef>
 
@@ -47,3 +49,9 @@
     ARM_COMPUTE_ERROR_ON(ptr == nullptr);
     clReleaseMemObject(static_cast<cl_mem>(ptr));
 }
+
+std::unique_ptr<IMemoryRegion> CLBufferAllocator::make_region(size_t size, size_t alignment)
+{
+    ARM_COMPUTE_UNUSED(alignment);
+    return arm_compute::support::cpp14::make_unique<CLBufferMemoryRegion>(_context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+}

diff --git a/src/runtime/CL/CLHOG.cpp b/src/runtime/CL/CLHOG.cpp
index 3f5266c..c4ea639 100644
--- a/src/runtime/CL/CLHOG.cpp
+++ b/src/runtime/CL/CLHOG.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,11 +74,11 @@
 uint8_t *CLHOG::do_map(cl::CommandQueue &q, bool blocking)
 {
     ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-    return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size()));
+    return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size() * sizeof(float)));
 }
 
 void CLHOG::do_unmap(cl::CommandQueue &q)
 {
     ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
     q.enqueueUnmapMemObject(_buffer, descriptor());
-}
\ No newline at end of file
+}

diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
new file mode 100644
index 0000000..534c4f9
--- /dev/null
+++ b/src/runtime/CL/CLMemory.cpp

@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMemory.h"
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+CLMemory::CLMemory()
+    : _region(nullptr), _region_owned(nullptr)
+{
+    create_empty_region();
+}
+
+CLMemory::CLMemory(std::shared_ptr<ICLMemoryRegion> memory)
+    : _region(nullptr), _region_owned(std::move(memory))
+{
+    if(_region_owned == nullptr)
+    {
+        create_empty_region();
+    }
+    _region = _region_owned.get();
+}
+
+CLMemory::CLMemory(ICLMemoryRegion *memory)
+    : _region(memory), _region_owned(nullptr)
+{
+    _region = memory;
+}
+
+ICLMemoryRegion *CLMemory::region()
+{
+    return _region;
+}
+
+ICLMemoryRegion *CLMemory::region() const
+{
+    return _region;
+}
+
+void CLMemory::create_empty_region()
+{
+    _region_owned = std::make_shared<CLBufferMemoryRegion>(cl::Context::getDefault(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0);
+    _region       = _region_owned.get();
+}
+} // namespace arm_compute
\ No newline at end of file

diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
new file mode 100644
index 0000000..15fd7f3
--- /dev/null
+++ b/src/runtime/CL/CLMemoryRegion.cpp

@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMemoryRegion.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+ICLMemoryRegion::ICLMemoryRegion(cl::Context ctx, size_t size)
+    : IMemoryRegion(size), _ctx(std::move(ctx)), _mapping(nullptr), _mem()
+{
+}
+
+const cl::Buffer &ICLMemoryRegion::cl_data() const
+{
+    return _mem;
+}
+
+void *ICLMemoryRegion::buffer()
+{
+    return _mapping;
+}
+
+void *ICLMemoryRegion::buffer() const
+{
+    return _mapping;
+}
+
+void **ICLMemoryRegion::handle()
+{
+    return reinterpret_cast<void **>(&_mem);
+}
+
+CLBufferMemoryRegion::CLBufferMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size)
+    : ICLMemoryRegion(std::move(ctx), size)
+{
+    if(_size != 0)
+    {
+        _mem = cl::Buffer(_ctx, flags, _size);
+    }
+}
+
+void *CLBufferMemoryRegion::ptr()
+{
+    return nullptr;
+}
+
+void *CLBufferMemoryRegion::map(cl::CommandQueue &q, bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
+    _mapping = q.enqueueMapBuffer(_mem, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, _size);
+    return _mapping;
+}
+
+void CLBufferMemoryRegion::unmap(cl::CommandQueue &q)
+{
+    ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
+    q.enqueueUnmapMemObject(_mem, _mapping);
+    _mapping = nullptr;
+}
+
+ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLMemoryRegion(std::move(ctx), size), _ptr(nullptr)
+{
+    if(size != 0)
+    {
+        _ptr = clSVMAlloc(_ctx.get(), flags, size, alignment);
+        if(_ptr != nullptr)
+        {
+            _mem = cl::Buffer(_ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr);
+        }
+    }
+}
+
+ICLSVMMemoryRegion::~ICLSVMMemoryRegion()
+{
+    if(_ptr != nullptr)
+    {
+        clFinish(CLScheduler::get().queue().get());
+        _mem = cl::Buffer();
+        clSVMFree(_ctx.get(), _ptr);
+    }
+}
+
+void *ICLSVMMemoryRegion::ptr()
+{
+    return _ptr;
+}
+
+CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLSVMMemoryRegion(std::move(ctx), flags, size, alignment)
+{
+}
+
+void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(_ptr == nullptr);
+    clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr);
+    _mapping = _ptr;
+    return _mapping;
+}
+
+void CLCoarseSVMMemoryRegion::unmap(cl::CommandQueue &q)
+{
+    ARM_COMPUTE_ERROR_ON(_ptr == nullptr);
+    clEnqueueSVMUnmap(q.get(), _ptr, 0, nullptr, nullptr);
+    _mapping = nullptr;
+}
+
+CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLSVMMemoryRegion(std::move(ctx), flags, size, alignment)
+{
+}
+
+void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
+{
+    if(blocking)
+    {
+        clFinish(q.get());
+    }
+    _mapping = _ptr;
+    return _mapping;
+}
+
+void CLFineSVMMemoryRegion::unmap(cl::CommandQueue &q)
+{
+    ARM_COMPUTE_UNUSED(q);
+    _mapping = nullptr;
+}
+} // namespace arm_compute
\ No newline at end of file

diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 65292fe..fdae615 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp

@@ -31,7 +31,7 @@
 std::once_flag CLScheduler::_initialize_symbols;
 
 CLScheduler::CLScheduler()
-    : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
+    : _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
 {
 }
 
@@ -52,7 +52,7 @@
     if(_cl_tuner != nullptr)
     {
         // Tune the OpenCL kernel
-        _cl_tuner->tune_kernel(kernel);
+        _cl_tuner->tune_kernel_dynamic(kernel);
     }
 
     // Run kernel

diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
index 5f58024..d0e7d76 100644
--- a/src/runtime/CL/CLSubTensor.cpp
+++ b/src/runtime/CL/CLSubTensor.cpp

@@ -29,6 +29,11 @@
 
 using namespace arm_compute;
 
+CLSubTensor::CLSubTensor()
+    : _parent(nullptr), _info()
+{
+}
+
 CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
     : _parent(nullptr), _info()
 {

diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp
index bc513d1..dd27738 100644
--- a/src/runtime/CL/CLTensor.cpp
+++ b/src/runtime/CL/CLTensor.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,7 @@
     return _allocator.cl_data();
 }
 
-ITensorAllocator *CLTensor::allocator()
+CLTensorAllocator *CLTensor::allocator()
 {
     return &_allocator;
 }

diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index ad165fa..54e7c5b 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,36 +30,57 @@
 
 using namespace arm_compute;
 
-CLTensorAllocator::CLTensorAllocator(CLTensor *owner)
-    : _associated_memory_group(nullptr), _buffer(), _mapping(nullptr), _owner(owner)
+namespace
 {
-}
+std::shared_ptr<arm_compute::ICLMemoryRegion> allocate_region(cl::Context context, size_t size, cl_uint alignment)
+{
+    // Try fine-grain SVM
+    std::shared_ptr<ICLMemoryRegion> region = std::make_shared<CLFineSVMMemoryRegion>(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment);
 
-CLTensorAllocator::~CLTensorAllocator()
+    // Try coarse-grain SVM in case of failure
+    if(region != nullptr && region->ptr() == nullptr)
+    {
+        region = std::make_shared<CLCoarseSVMMemoryRegion>(context, CL_MEM_READ_WRITE, size, alignment);
+    }
+    // Try legacy buffer memory in case of failure
+    if(region != nullptr && region->ptr() == nullptr)
+    {
+        region = std::make_shared<CLBufferMemoryRegion>(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+    }
+    return region;
+}
+} // namespace
+
+CLTensorAllocator::CLTensorAllocator(CLTensor *owner)
+    : _associated_memory_group(nullptr), _memory(), _owner(owner)
 {
-    _buffer = cl::Buffer();
 }
 
 uint8_t *CLTensorAllocator::data()
 {
-    return _mapping;
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
 }
 
 const cl::Buffer &CLTensorAllocator::cl_data() const
 {
-    return _buffer;
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    return _memory.region()->cl_data();
 }
 
 void CLTensorAllocator::allocate()
 {
-    ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+
     if(_associated_memory_group == nullptr)
     {
-        _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size());
+        ARM_COMPUTE_ERROR_ON(_memory.region()->cl_data().get() != nullptr);
+        _memory = CLMemory(allocate_region(CLScheduler::get().context(), info().total_size(), 0));
     }
     else
     {
-        _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_buffer()), info().total_size());
+        _associated_memory_group->finalize_memory(_owner, _memory.region()->handle(), info().total_size());
+        _memory.region()->set_size(info().total_size());
     }
     info().set_is_resizable(false);
 }
@@ -68,41 +89,55 @@
 {
     if(_associated_memory_group == nullptr)
     {
-        _buffer = cl::Buffer();
+        _memory = CLMemory();
         info().set_is_resizable(true);
     }
 }
 
+arm_compute::Status CLTensorAllocator::import_memory(CLMemory memory)
+{
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(memory.region()->cl_data().get() == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
+    _memory = memory;
+    info().set_is_resizable(false);
+
+    return Status{};
+}
+
 void CLTensorAllocator::set_associated_memory_group(CLMemoryGroup *associated_memory_group)
 {
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
     ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
     ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
-    ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region()->cl_data().get() != nullptr);
+    _memory                  = CLMemory(std::make_shared<CLBufferMemoryRegion>(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0));
     _associated_memory_group = associated_memory_group;
 }
 
 uint8_t *CLTensorAllocator::lock()
 {
-    ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
-    _mapping = map(CLScheduler::get().queue(), true);
-    return _mapping;
+    return map(CLScheduler::get().queue(), true);
 }
 
 void CLTensorAllocator::unlock()
 {
-    ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
-    unmap(CLScheduler::get().queue(), _mapping);
-    _mapping = nullptr;
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    unmap(CLScheduler::get().queue(), reinterpret_cast<uint8_t *>(_memory.region()->buffer()));
 }
 
 uint8_t *CLTensorAllocator::map(cl::CommandQueue &q, bool blocking)
 {
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-    return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info().total_size()));
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
+    _memory.region()->map(q, blocking);
+    return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
 }
 
 void CLTensorAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping)
 {
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-    q.enqueueUnmapMemObject(_buffer, mapping);
+    ARM_COMPUTE_UNUSED(mapping);
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() == nullptr);
+    _memory.region()->unmap(q);
 }

diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index df8e255..5f82cd3 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp

@@ -35,61 +35,6 @@
 
 using namespace arm_compute;
 
-namespace
-{
-/* Function to be used to intercept kernel enqueues and store their OpenCL Event */
-class Interceptor
-{
-public:
-    explicit Interceptor(CLTuner &tuner);
-
-    /** clEnqueueNDRangeKernel interface
-     *
-     * @param[in] command_queue           A valid command-queue. The kernel will be queued for execution on the device associated with command_queue.
-     * @param[in] kernel                  A valid kernel object. The OpenCL context associated with kernel and command_queue must be the same.
-     * @param[in] work_dim                The number of dimensions used to specify the global work-items and work-items in the work-group. work_dim must be greater than zero and less than or equal to CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS.
-     * @param[in] gwo                     Global-Workgroup-Offset. It can be used to specify an array of work_dim unsigned values that describe the offset used to calculate the global ID of a work-item. If global_work_offset is NULL, the global IDs start at offset (0, 0, ... 0).
-     * @param[in] gws                     Global-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of global work-items in work_dim dimensions that will execute the kernel function.
-     * @param[in] lws                     Local-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of work-items that make up a work-group
-     * @param[in] num_events_in_wait_list Number of events in the waiting list
-     * @param[in] event_wait_list         Event waiting list
-     * @param[in] event                   OpenCL kernel event
-     *
-     * @return the OpenCL status
-     */
-    cl_int operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
-                      const cl_event *event_wait_list, cl_event *event);
-
-private:
-    CLTuner &_tuner;
-};
-
-Interceptor::Interceptor(CLTuner &tuner)
-    : _tuner(tuner)
-{
-}
-
-cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
-                               const cl_event *event_wait_list, cl_event *event)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
-    ARM_COMPUTE_UNUSED(event);
-    if(_tuner.kernel_event_is_set())
-    {
-        // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
-        return CL_SUCCESS;
-    }
-    cl_event tmp;
-    cl_int   retval = _tuner.real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
-
-    // Set OpenCL event
-    _tuner.set_cl_kernel_event(tmp);
-
-    return retval;
-}
-
-} // namespace
-
 CLTuner::CLTuner(bool tune_new_kernels)
     : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
 {
@@ -113,7 +58,12 @@
     return _tune_new_kernels;
 }
 
-void CLTuner::tune_kernel(ICLKernel &kernel)
+void CLTuner::tune_kernel_static(ICLKernel &kernel)
+{
+    ARM_COMPUTE_UNUSED(kernel);
+}
+
+void CLTuner::tune_kernel_dynamic(ICLKernel &kernel)
 {
     // Get the configuration ID from the kernel
     const std::string &config_id = kernel.config_id();
@@ -173,7 +123,25 @@
         }
     }
     // Start intercepting enqueues:
-    CLSymbols::get().clEnqueueNDRangeKernel_ptr = Interceptor(*this);
+    auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+                              const cl_event * event_wait_list, cl_event * event)
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
+        ARM_COMPUTE_UNUSED(event);
+        if(this->kernel_event_is_set())
+        {
+            // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
+            return CL_SUCCESS;
+        }
+        cl_event tmp;
+        cl_int   retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
+
+        // Set OpenCL event
+        this->set_cl_kernel_event(tmp);
+
+        return retval;
+    };
+    CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
 
     cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
 

diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
new file mode 100644
index 0000000..ff50073
--- /dev/null
+++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp

@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLChannelShuffleLayerKernel>();
+    k->configure(input, output, num_groups);
+    _kernel = std::move(k);
+}
+
+Status CLChannelShuffleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+{
+    return CLChannelShuffleLayerKernel::validate(input, output, num_groups);
+}
+} // namespace arm_compute

diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000..c226e56
--- /dev/null
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp

@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
+
+using namespace arm_compute;
+
+void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
+                                               DataLayout data_layout)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLConvertFullyConnectedWeightsKernel>();
+    k->configure(input, output, original_input_shape, data_layout);
+    _kernel = std::move(k);
+}
+
+Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
+                                                DataLayout data_layout)
+{
+    return CLConvertFullyConnectedWeightsKernel::validate(input, output, original_input_shape, data_layout);
+}
\ No newline at end of file

diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 1a486ce..47a8d5f 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp

@@ -42,25 +42,34 @@
 {
 }
 
-void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
+                                                            enable_fast_math));
 
-    switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
-                                                      weights_info, CLScheduler::get().target()))
+    switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
+                                                      weights_info, act_info, CLScheduler::get().target(), dilation, enable_fast_math))
     {
+        case ConvolutionMethod::WINOGRAD:
+        {
+            auto f = arm_compute::support::cpp14::make_unique<CLWinogradConvolutionLayer>(_memory_manager);
+            f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
+            _function = std::move(f);
+            break;
+        }
         case ConvolutionMethod::DIRECT:
         {
             auto f = arm_compute::support::cpp14::make_unique<CLDirectConvolutionLayer>();
-            f->configure(input, weights, biases, output, conv_info);
+            f->configure(input, weights, biases, output, conv_info, act_info);
             _function = std::move(f);
             break;
         }
         case ConvolutionMethod::GEMM:
         {
             auto f = arm_compute::support::cpp14::make_unique<CLGEMMConvolutionLayer>(_memory_manager);
-            f->configure(input, weights, biases, output, conv_info, weights_info);
+            f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
             _function = std::move(f);
             break;
         }
@@ -71,25 +80,30 @@
 }
 
 Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                    const WeightsInfo &weights_info)
+                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
 
-    //Configure if the parameters match the direct convolution or the gemm-based
     const GPUTarget gpu_target = CLScheduler::get().target();
 
-    switch(CLConvolutionLayer::get_convolution_method(input, weights, biases, output, conv_info, weights_info, gpu_target))
+    switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation, enable_fast_math))
     {
+        case ConvolutionMethod::WINOGRAD:
+        {
+            //Validate Winograd
+            ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
+            break;
+        }
         case ConvolutionMethod::DIRECT:
         {
             // Validate direct convolution layer
-            CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
             break;
         }
         case ConvolutionMethod::GEMM:
         {
             // Validate gemm-based convolution layer
-            CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info));
             break;
         }
         default:
@@ -100,21 +114,34 @@
     return Status{};
 }
 
-ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                             const WeightsInfo &weights_info, const GPUTarget gpu_target)
+ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                             const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math)
 {
-    ARM_COMPUTE_UNUSED(input);
-    ARM_COMPUTE_UNUSED(weights);
-    ARM_COMPUTE_UNUSED(biases);
-    ARM_COMPUTE_UNUSED(output);
-    ARM_COMPUTE_UNUSED(conv_info);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
     ARM_COMPUTE_UNUSED(weights_info);
     ARM_COMPUTE_UNUSED(gpu_target);
 
-    return ConvolutionMethod::GEMM;
+    const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+
+    if(dilation != Size2D(1U, 1U) || (input->dimension(idx_c) < 16))
+    {
+        return ConvolutionMethod::GEMM;
+    }
+    else
+    {
+        return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+    }
 }
 
 void CLConvolutionLayer::run()
 {
+    prepare();
     _function->run();
 }
+
+void CLConvolutionLayer::prepare()
+{
+    _function->prepare();
+}

diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp
new file mode 100644
index 0000000..3442e37
--- /dev/null
+++ b/src/runtime/CL/functions/CLCopy.cpp

@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLCopy::configure(ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLCopyKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
+}

diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 9e6c0b4..cb8dc02 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp

@@ -80,7 +80,7 @@
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, info, WeightsInfo()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo()));
 
     return Status{};
 }

diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 88e9376..676a121 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp

@@ -24,6 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
@@ -35,17 +37,27 @@
 using namespace arm_compute::misc::shape_calculator;
 
 CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
-    : _kernel(), _border_handler()
+    : _kernel(nullptr), _border_handler()
 {
 }
 
-void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+                                               ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
-    _kernel.set_target(CLScheduler::get().target());
-    _kernel.configure(input, weights, biases, output, conv_info);
+    if(input->info()->data_layout() == DataLayout::NCHW)
+    {
+        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
+    }
+    else
+    {
+        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+    }
+
+    _kernel->set_target(CLScheduler::get().target());
+    _kernel->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
 
     // Configure border handler
     PixelValue &&zero_value(0.f);
@@ -53,42 +65,62 @@
     {
         zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
     }
-    _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value);
+    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
+}
+
+Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                unsigned int        depth_multiplier,
+                                                ActivationLayerInfo act_info, GPUTarget gpu_target)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+
+    if(input->data_layout() == DataLayout::NCHW)
+    {
+        return CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target);
+    }
+
+    return CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
 }
 
 void CLDepthwiseConvolutionLayer3x3::run()
 {
     CLScheduler::get().enqueue(_border_handler);
-    CLScheduler::get().enqueue(_kernel);
+    CLScheduler::get().enqueue(*_kernel);
 }
 
 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
-      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
+      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
 
     const size_t weights_w = weights->info()->dimension(0);
     const size_t weights_h = weights->info()->dimension(1);
     const size_t weights_z = weights->info()->dimension(2);
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_first_run     = true;
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
     bool            append_bias = (biases != nullptr) && !_is_quantized;
     const GPUTarget gpu_target  = CLScheduler::get().target();
 
     // Calculate output shape
-    TensorShape dwc_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
+    TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
 
     // Output width and height
-    const unsigned int conv_w = dwc_output_shape.x();
-    const unsigned int conv_h = dwc_output_shape.y();
+    const unsigned int conv_w = output_shape.x();
+    const unsigned int conv_h = output_shape.y();
 
     // Set up intermediate tensors
     const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
@@ -101,7 +133,7 @@
     shape_im2col.set(2, weights_z);
     _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
     _im2col_kernel.set_target(gpu_target);
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias);
+    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -117,7 +149,7 @@
     _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
     _v2mm_kernel.set_target(gpu_target);
     _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
-    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dwc_output_shape));
+    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
     _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
 
     // Output staged configuration
@@ -152,18 +184,72 @@
     _v2mm_output.allocator()->allocate();
 }
 
+Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                             unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != weights->dimension(2));
+
+    const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+    const bool         append_bias  = (biases != nullptr) && !is_quantized;
+    const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+    const size_t       weights_w    = weights->dimension(0);
+    const size_t       weights_h    = weights->dimension(1);
+    const size_t       weights_z    = weights->dimension(2);
+    const unsigned int conv_w       = output_shape.x();
+    const unsigned int conv_h       = output_shape.y();
+    const size_t       patch_size   = weights_w * weights_h + ((append_bias) ? 1 : 0);
+    const size_t       conv_size    = conv_w * conv_h;
+
+    TensorShape shape_im2col = input->tensor_shape();
+    shape_im2col.set(0, patch_size);
+    shape_im2col.set(1, conv_size);
+    shape_im2col.set(2, weights_z);
+    TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+
+    const TensorShape shape_weights_reshape(patch_size, weights_z);
+    TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseWeightsReshapeKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
+
+    DataType    v2mm_dt        = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+    TensorShape shape_v2mm_out = input->tensor_shape();
+    shape_v2mm_out.set(0, conv_size * weights_z);
+    shape_v2mm_out.set(1, 1);
+    shape_v2mm_out.set(2, 1);
+    TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
+
+    TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
+
+    if(is_quantized)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
+    }
+
+    return Status{};
+}
+
 void CLDepthwiseConvolutionLayer::run()
 {
+    // Run weights reshaping (Runs once for every configure)
+    if(_is_first_run)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        CLScheduler::get().enqueue(_weights_reshape_kernel);
+        CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+        _is_first_run = false;
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
+    }
+
     CLScheduler::get().enqueue(_im2col_kernel);
-
-    CLScheduler::get().enqueue(_weights_reshape_kernel);
-
     CLScheduler::get().enqueue(_v2mm_input_fill_border);
-    CLScheduler::get().enqueue(_v2mm_weights_fill_border);
     CLScheduler::get().enqueue(_v2mm_kernel);
-
     CLScheduler::get().enqueue(_vector_to_tensor_kernel);
-
     if(_is_quantized)
     {
         CLScheduler::get().enqueue(_output_stage_kernel);

diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 5559d42..6f33b2e 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 
 #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 using namespace arm_compute;
@@ -33,8 +34,18 @@
 {
 }
 
+Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(input, output, min_max));
+
+    return Status{};
+}
+
 void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+
     _dequantize_kernel.configure(input, output, min_max);
 }
 

diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index d6a335c..c451bd4 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,11 +33,11 @@
 using namespace arm_compute;
 
 CLDirectConvolutionLayer::CLDirectConvolutionLayer()
-    : _direct_conv_kernel(), _input_border_handler()
+    : _direct_conv_kernel(), _input_border_handler(), _activationlayer_function(), _is_activationlayer_enabled(false)
 {
 }
 
-void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
 {
     // Set GPU target
     _direct_conv_kernel.set_target(CLScheduler::get().target());
@@ -52,11 +52,28 @@
         zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
     }
     _input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(_direct_conv_kernel);
+
+    _is_activationlayer_enabled = act_info.enabled();
+
+    //Configure Activation Layer
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
 }
 
-Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                          const ActivationLayerInfo &act_info)
 {
-    return CLDirectConvolutionLayerKernel::validate(input, weights, biases, output, conv_info, CLScheduler::get().target());
+    ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerKernel::validate(input, weights, biases, output, conv_info, CLScheduler::get().target()));
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+    }
+    return Status{};
 }
 
 void CLDirectConvolutionLayer::run()
@@ -66,4 +83,10 @@
 
     // Run direct convolution
     CLScheduler::get().enqueue(_direct_conv_kernel);
+
+    //Run Activation Layer
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
 }

diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 2b4670b..151fa1b 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp

@@ -37,10 +37,8 @@
 
 namespace
 {
-Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output, bool is_interleaved_transposed)
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
 {
-    const GPUTarget gpu_target = CLScheduler::get().target();
-
     if(is_data_type_quantized_asymmetric(input.data_type()))
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -55,7 +53,7 @@
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, GEMMReshapeInfo(), gpu_target));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
     }
 
     return Status{};
@@ -75,12 +73,12 @@
 }
 
 CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _im2col_output(),
-      _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false)
+    : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(),
+      _im2col_output(), _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
-void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed)
+void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
 {
     if(_is_quantized)
     {
@@ -102,8 +100,7 @@
     else
     {
         // Configure matrix multiply kernel
-        _mm_kernel.set_target(CLScheduler::get().target());
-        _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed);
+        _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
     }
 }
 
@@ -114,7 +111,7 @@
     // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
 
     // Initialize output tensor for im2col
-    TensorShape shape_im2col = compute_im2col_shape(input->info());
+    TensorShape shape_im2col = compute_im2col_fc_shape(input->info());
     _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
 
     // Configure im2col kernel
@@ -122,7 +119,7 @@
     _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
 
     // Configure matrix multiply kernel
-    configure_mm(&_im2col_output, weights, output, false);
+    configure_mm(&_im2col_output, weights, output);
 
     // Allocate the output tensor for im2col once all the configure methods have been called
     _im2col_output.allocator()->allocate();
@@ -133,7 +130,7 @@
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
 
     // Configure matrix multiply kernel
-    configure_mm(input, weights, output, false);
+    configure_mm(input, weights, output);
 }
 
 void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
@@ -152,6 +149,7 @@
     _is_fc_after_conv     = true;
     _accumulate_biases    = false;
     _is_quantized         = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _original_weights     = weights;
 
     // Configure gemmlowp output
     if(_is_quantized)
@@ -222,13 +220,6 @@
         _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
         _gemmlowp_output.allocator()->allocate();
     }
-
-    // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
-    if(!_are_weights_reshaped)
-    {
-        // Allocate the tensor for the weights reshaped
-        _reshape_weights_output.allocator()->allocate();
-    }
 }
 
 Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
@@ -243,7 +234,7 @@
     bool            is_quantized     = is_data_type_quantized_asymmetric(input->data_type());
     const GPUTarget gpu_target       = CLScheduler::get().target();
 
-    const ITensorInfo &im2col_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(input)));
+    const ITensorInfo &im2col_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input)));
     const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
     const ITensorInfo &gemmlowp_output  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
 
@@ -300,7 +291,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
     }
     // Validate matrix multiply kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output, false));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
 
     // Validate output stage for asymmetric quantized types
     if(is_quantized)
@@ -313,12 +304,7 @@
 
 void CLFullyConnectedLayer::run()
 {
-    // Reshape of the weights (happens only once)
-    if(!_are_weights_reshaped)
-    {
-        _are_weights_reshaped = true;
-        _reshape_weights_kernel.run();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -335,7 +321,7 @@
     }
     else
     {
-        CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
+        _mm_gemm.run();
     }
 
     // Accumulate biases if provided
@@ -353,3 +339,30 @@
 
     _memory_group.release();
 }
+
+void CLFullyConnectedLayer::prepare()
+{
+    // Reshape of the weights (happens only once)
+    if(!_are_weights_reshaped)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run reshape weights kernel and mark weights as unused
+        _reshape_weights_output.allocator()->allocate();
+        _reshape_weights_kernel.run();
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM prepare and release unused weights
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+            if(!_reshape_weights_output.is_used())
+            {
+                _reshape_weights_output.allocator()->free();
+            }
+        }
+
+        CLScheduler::get().queue().finish();
+        _are_weights_reshaped = true;
+    }
+}

diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 6b5cd2d..f81da6c 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp

@@ -29,14 +29,18 @@
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
 
 namespace
 {
@@ -44,9 +48,10 @@
 {
     bool flag = true;
 
-    if(gpu_target == GPUTarget::BIFROST)
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
     {
-        if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run)
+        // COMPMID-852
+        if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
         {
             const float scale = k < 1024 ? 2.0f : 2.5f;
             flag              = (scale * n) > ((1.66f * n) + 38.4f);
@@ -56,39 +61,19 @@
             flag = false;
         }
     }
-
-    return flag;
-}
-
-Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const ICLTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info = GEMMInfo())
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
-    if(c != nullptr)
+    else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(0) != output->dimension(0), "The C matrix must have the same number of rows as the output matrix");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(1) != output->dimension(1), "The C matrix must have the same number of columns as the output matrix");
+        // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
+        flag = m != 1 && reshape_b_only_on_first_run;
     }
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-
-    ARM_COMPUTE_UNUSED(alpha);
-    ARM_COMPUTE_UNUSED(beta);
-    return Status{};
+    return flag;
 }
 } // namespace
 
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
-      _is_first_run(true), _reshape_b_only_on_first_run(false)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
+      _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
 
@@ -97,10 +82,14 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(a->info(), b->info(), c, output->info(), alpha, beta, gemm_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
+
+    // Store original b matrix
+    _original_b = b;
 
     // Check if we need to reshape the matrix B only on the first run
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _is_prepared                 = false;
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
@@ -121,7 +110,7 @@
     int       mult_transpose1xW_width   = 1;
     int       mult_interleave4x4_height = 1;
 
-    if(gpu_target == GPUTarget::BIFROST)
+    if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
         mult_transpose1xW_width   = 4;
         mult_interleave4x4_height = 2;
@@ -137,8 +126,10 @@
 
         // Manage intermediate buffers
         _memory_group.manage(&_tmp_a);
-        _memory_group.manage(&_tmp_b);
-
+        if(!_reshape_b_only_on_first_run)
+        {
+            _memory_group.manage(&_tmp_b);
+        }
         // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
 
         // Configure interleave kernel
@@ -154,7 +145,10 @@
     {
         // Allocate intermediate tensors
         _tmp_a.allocator()->allocate();
-        _tmp_b.allocator()->allocate();
+        if(!_reshape_b_only_on_first_run)
+        {
+            _tmp_b.allocator()->allocate();
+        }
     }
 
     // Configure matrix addition kernel
@@ -165,14 +159,74 @@
     }
 }
 
-Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ICLTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info)
+Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(a, b, c, output, alpha, beta, gemm_info));
+    ARM_COMPUTE_UNUSED(alpha);
+
+    // Check if we need to reshape the matrix B only on the first run
+    const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+
+    const ITensorInfo *matrix_a_info = a;
+    const ITensorInfo *matrix_b_info = b;
+
+    TensorInfo tmp_a_info{};
+    TensorInfo tmp_b_info{};
+    TensorInfo tmp_output_info = *output->clone();
+
+    // Get the GPU target
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    // Arguments used by GEMMReshapeInfo
+    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
+    // in order to know how the matrices have been reshaped
+    const int m                         = a->dimension(1);
+    const int n                         = b->dimension(0);
+    const int k                         = a->dimension(0);
+    int       mult_transpose1xW_width   = 1;
+    int       mult_interleave4x4_height = 1;
+
+    if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
+    {
+        mult_transpose1xW_width   = 4;
+        mult_interleave4x4_height = 2;
+    }
+
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height);
+
+    // Check if we need to reshape the matrix A and matrix B
+    const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
+
+    if(run_interleave_transpose)
+    {
+        matrix_a_info = &tmp_a_info;
+        matrix_b_info = &tmp_b_info;
+
+        // Validate interleave kernel
+        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height));
+
+        // Validate transpose kernel
+        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+    }
+
+    // Validate matrix multiply
+    auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info, gpu_target));
+
+    if(beta != 0 && c != nullptr)
+    {
+        // Validate matrix addition kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, &tmp_output_info, beta));
+    }
+
     return Status{};
 }
 
 void CLGEMM::run()
 {
+    prepare();
+
     _memory_group.acquire();
 
     if(_is_interleaved_transposed)
@@ -180,14 +234,7 @@
         // Run interleave kernel
         CLScheduler::get().enqueue(_interleave_kernel, false);
 
-        if(_is_first_run)
-        {
-            // Run transpose kernel
-            CLScheduler::get().enqueue(_transpose_kernel, false);
-
-            _is_first_run = false;
-        }
-        else if(!_reshape_b_only_on_first_run)
+        if(!_reshape_b_only_on_first_run)
         {
             // Run transpose kernel
             CLScheduler::get().enqueue(_transpose_kernel, false);
@@ -205,3 +252,19 @@
 
     _memory_group.release();
 }
+
+void CLGEMM::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        {
+            // Run transpose kernel
+            _tmp_b.allocator()->allocate();
+            CLScheduler::get().enqueue(_transpose_kernel, false);
+            _original_b->mark_as_unused();
+        }
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
+}

diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index c58af36..79495e4 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp

@@ -38,8 +38,8 @@
 using namespace arm_compute;
 using namespace arm_compute::misc::shape_calculator;
 
-CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped()
+CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
+    : _weights_reshape_kernel()
 {
 }
 
@@ -86,16 +86,12 @@
 
 void CLConvolutionLayerReshapeWeights::run()
 {
-    _memory_group.acquire();
-
     CLScheduler::get().enqueue(_weights_reshape_kernel);
-
-    _memory_group.release();
 }
 
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _im2col_output(),
-      _interleave_output(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_first_run(true)
+    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
+      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
@@ -155,7 +151,8 @@
     return Status{};
 }
 
-void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+                                       const Size2D &dilation, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
@@ -164,9 +161,13 @@
                                                                 biases != nullptr ? biases->info() : nullptr,
                                                                 output->info(),
                                                                 conv_info,
-                                                                weights_info));
+                                                                weights_info,
+                                                                dilation,
+                                                                act_info));
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_prepared      = false;
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
     const DataType dt = input->info()->data_type();
 
@@ -191,7 +192,7 @@
     const unsigned int kernel_width  = weights->info()->dimension(0);
     const unsigned int kernel_height = weights->info()->dimension(1);
     std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
-                                                 conv_info);
+                                                 conv_info, dilation);
 
     unsigned int mat_weights_cols = weights->info()->dimension(3);
     unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
@@ -226,7 +227,7 @@
     _memory_group.manage(&_gemm_output);
 
     // Configure im2col
-    _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias);
+    _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
 
     // Configure GEMM
     configure_mm(&_im2col_output, weights, &_gemm_output);
@@ -255,14 +256,19 @@
 
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
 
-    // Allocate intermediate tensor
-    _weights_reshaped.allocator()->allocate();
+    //Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
 
     ARM_COMPUTE_UNUSED(weights_info);
 }
 
 Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                        const WeightsInfo &weights_info)
+                                        const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
@@ -272,6 +278,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
 
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a());
+    }
+
     const bool     is_quantized = is_data_type_quantized_asymmetric(input->data_type());
     const bool     append_bias  = (biases != nullptr) && (!is_quantized);
     const unsigned bias_element = (append_bias) ? 1 : 0;
@@ -284,12 +295,12 @@
     const unsigned int kernel_width  = weights->dimension(0);
     const unsigned int kernel_height = weights->dimension(1);
 
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, conv_info);
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, conv_info, dilation);
 
     unsigned int mat_weights_cols = weights->dimension(3);
     unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + bias_element;
 
-    CLConvolutionLayerReshapeWeights::validate(weights, biases, nullptr);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr));
 
     // Create tensor info for im2col reshaped inputs
     const unsigned int mat_input_cols = mat_weights_rows;
@@ -300,7 +311,7 @@
     shape_im2col.set(2, 1);
     TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->fixed_point_position());
     im2col_reshaped_info.set_quantization_info(input->quantization_info());
-    CLIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
 
     // Create GEMM output tensor
     TensorShape shape_gemm = im2col_reshaped_info.tensor_shape();
@@ -311,9 +322,10 @@
     TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->fixed_point_position());
     info_gemm.set_quantization_info(output->quantization_info());
 
-    validate_mm(&im2col_reshaped_info, weights, &info_gemm);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(&im2col_reshaped_info, weights, &info_gemm));
+    TensorInfo tmp_info(shape_gemm, 1, DataType::QASYMM8, input->fixed_point_position());
+    tmp_info.set_quantization_info(output->quantization_info());
 
-    TensorInfo tmp_info(input->tensor_shape(), 1, DataType::QASYMM8, input->fixed_point_position());
     if(is_quantized)
     {
         float multiplier = input->quantization_info().scale * weights->quantization_info().scale / output->quantization_info().scale;
@@ -324,7 +336,7 @@
     }
 
     // Validate Col2Im
-    CLCol2ImKernel::validate(is_quantized ? &tmp_info : &info_gemm, output, std::make_pair(conv_w, conv_h));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(is_quantized ? &tmp_info : &info_gemm, output, std::make_pair(conv_w, conv_h)));
 
     if(biases != nullptr)
     {
@@ -341,18 +353,18 @@
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     }
 
+    //Validate Activation Layer
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+    }
+
     return Status{};
 }
 
 void CLGEMMConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        _reshape_weights.run();
-
-        _is_first_run = false;
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -377,5 +389,36 @@
     // Reshape output matrix
     CLScheduler::get().enqueue(_col2im_kernel, false);
 
+    //Run Activation Layer if enabled
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
+
     _memory_group.release();
 }
+
+void CLGEMMConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run weights reshaping and mark as unused
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        _weights_reshaped.allocator()->allocate();
+        _reshape_weights.run();
+        _original_weights->mark_as_unused();
+
+        // Run GEMM prepare
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+            if(!_weights_reshaped.is_used())
+            {
+                _weights_reshaped.allocator()->free();
+            }
+        }
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
+}

diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index c688299..711b006 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp

@@ -41,7 +41,7 @@
 {
     bool flag = true;
 
-    if(gpu_target == GPUTarget::BIFROST)
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
     {
         // COMPMID-852
         if(k > 256 && m > 4 && reshape_b_only_on_first_run)
@@ -102,7 +102,10 @@
         matrix_b = &_tmp_b;
 
         _memory_group.manage(&_tmp_a);
-        _memory_group.manage(&_tmp_b);
+        if(!_reshape_b_only_on_first_run)
+        {
+            _memory_group.manage(&_tmp_b);
+        }
 
         // Configure interleave kernel
         _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height);
@@ -119,7 +122,10 @@
     {
         TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
         _vector_sum_col.allocator()->init(info_vector_sum_col);
-        _memory_group.manage(&_vector_sum_col);
+        if(!_reshape_b_only_on_first_run)
+        {
+            _memory_group.manage(&_vector_sum_col);
+        }
 
         // Configure Matrix B reduction kernel
         _mtx_b_reduction_kernel.configure(b, &_vector_sum_col);

diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index 4b32954..ddce5fb 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,7 +49,8 @@
 }
 
 CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT
-    : _border_handler(),
+    : _horizontal_border_handler(),
+      _vertical_border_handler(),
       _horizontal_reduction(),
       _vertical_reduction()
 {
@@ -64,6 +65,9 @@
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
     ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale());
 
+    // Constant value to use for vertical fill border when the border mode is CONSTANT
+    const uint16_t pixel_value_u16 = static_cast<uint16_t>(constant_border_value) * 2 + static_cast<uint16_t>(constant_border_value) * 8 + static_cast<uint16_t>(constant_border_value) * 6;
+
     /* Get number of pyramid levels */
     const size_t num_levels = pyramid->info()->num_levels();
 
@@ -72,28 +76,31 @@
 
     if(num_levels > 1)
     {
-        _border_handler       = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
-        _horizontal_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
-        _vertical_reduction   = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
+        _horizontal_border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
+        _vertical_border_handler   = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
+        _horizontal_reduction      = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
+        _vertical_reduction        = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
 
         // Apply half scale to the X dimension of the tensor shape
         TensorShape tensor_shape = pyramid->info()->tensor_shape();
         tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
 
         PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::U16);
-
         _tmp.init(pyramid_info);
 
         for(size_t i = 0; i < num_levels - 1; ++i)
         {
             /* Configure horizontal kernel */
-            _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED);
+            _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
 
             /* Configure vertical kernel */
-            _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED);
+            _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
 
             /* Configure border */
-            _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+            _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+
+            /* Configure border */
+            _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
         }
         _tmp.allocate();
     }
@@ -110,13 +117,15 @@
     _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */);
     _input->map(CLScheduler::get().queue(), true /* blocking */);
     _pyramid->get_pyramid_level(0)->copy_from(*_input);
+
     _input->unmap(CLScheduler::get().queue());
     _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue());
 
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
-        CLScheduler::get().enqueue(_border_handler[i], false);
+        CLScheduler::get().enqueue(_horizontal_border_handler[i], false);
         CLScheduler::get().enqueue(_horizontal_reduction[i], false);
+        CLScheduler::get().enqueue(_vertical_border_handler[i], false);
         CLScheduler::get().enqueue(_vertical_reduction[i], false);
     }
 }

diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index d1bb65f..a3010a7 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,6 +52,26 @@
     _sumsq.allocator()->allocate();
 }
 
+Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+    TensorShape shape(input->tensor_shape());
+
+    // Create intermediate tensor info
+    TensorInfo sum_sq;
+    sum_sq.set_data_type(input->data_type());
+    sum_sq.set_tensor_shape(shape);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
+
+    // Reduce shape on axis (supported axis is 0)
+    shape.set(0, 1);
+    sum_sq.set_tensor_shape(shape);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
+
+    return Status{};
+}
+
 void CLL2NormalizeLayer::run()
 {
     _memory_group.acquire();

diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
new file mode 100644
index 0000000..930d311
--- /dev/null
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp

@@ -0,0 +1,508 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _gemm_input_gate1(), _gemm_input_gate2(), _transpose_input_gate1(), _transpose_input_gate2(), _accum_input_gate1(),
+      _accum_input_gate2(), _subtract_input_gate(), _activation_input_gate(), _fully_connected_forget_gate(), _gemm_forget_gate1(), _gemm_forget_gate2(), _transpose_forget_gate1(),
+      _transpose_forget_gate2(), _accum_forget_gate1(), _accum_forget_gate2(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state1(),
+      _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output1(),
+      _gemm_output2(), _transpose_output1(), _transpose_output2(), _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state(),
+      _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(),
+      _input_gate_out3(), _input_gate_out4(), _input_gate_out5(), _input_gate_out6(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(),
+      _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _output6(),
+      _cell_state_activation(), _output_projection1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false),
+      _perform_projection_clipping(false)
+{
+}
+
+void CLLSTMLayer::configure(const ICLTensor *input, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
+                            const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
+                            const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
+                            ICLTensor *output_state, ICLTensor *cell_state, ICLTensor *scratch_buffer, ICLTensor *output, const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info,
+                            float cell_threshold, float projection_threshold)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+                                 forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
+    LSTMParams<ITensorInfo> lstm_params_info;
+    if(lstm_params.has_peephole_opt())
+    {
+        lstm_params_info.set_peephole_params(lstm_params.cell_to_input_weights()->info(), lstm_params.cell_to_forget_weights()->info(), lstm_params.cell_to_output_weights()->info());
+    }
+    if(lstm_params.has_projection())
+    {
+        lstm_params_info.set_projection_params(lstm_params.projection_weights()->info(), lstm_params.projection_bias()->info());
+    }
+    if(!lstm_params.has_cifg_opt())
+    {
+        lstm_params_info.set_cifg_params(lstm_params.input_to_input_weights()->info(), lstm_params.recurrent_to_input_weights()->info(),
+                                         lstm_params.cell_to_input_weights()->info(), lstm_params.input_gate_bias()->info());
+    }
+    ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(),
+                                                     input_to_cell_weights->info(), input_to_output_weights->info(),
+                                                     recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+                                                     forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+                                                     output_state->info(), cell_state->info(), scratch_buffer->info(), output->info(), lstm_params_info,
+                                                     activation_info, cell_threshold, projection_threshold));
+
+    const TensorShape cell_state_shape = cell_state->info()->tensor_shape();
+
+    TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    TensorShape forget_gate2_shape = compute_transposed_shape(*forget_gate_bias->info());
+    TensorShape forget_gate3_shape{ 1, output_state->info()->dimension(1) };
+    _forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
+    _forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _forget_gate_out6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+    // Configure block that calculates the forget gate
+    // forget_gate = Activation(input * input_to_forget_weights + output_state * recurrent_to_forget_weights + cell_state * cell_to_forget_weights + forget_gate_bias)
+    _memory_group.manage(&_forget_gate_out1);
+    _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1, true, false);
+    _memory_group.manage(&_forget_gate_out2);
+    _transpose_forget_gate1.configure(recurrent_to_forget_weights, &_forget_gate_out2);
+    _memory_group.manage(&_forget_gate_out3);
+    _gemm_forget_gate1.configure(output_state, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
+    _forget_gate_out2.allocator()->allocate();
+    _memory_group.manage(&_forget_gate_out6);
+    _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out6, ConvertPolicy::SATURATE);
+    CLTensor *forget_gate_out = &_forget_gate_out6;
+
+    if(lstm_params.has_peephole_opt())
+    {
+        _forget_gate_out4.allocator()->init(TensorInfo(forget_gate2_shape, 1, input->info()->data_type()));
+        _forget_gate_out5.allocator()->init(TensorInfo(forget_gate3_shape, 1, input->info()->data_type()));
+
+        _run_peephole_opt = true;
+        _memory_group.manage(&_forget_gate_out4);
+        _transpose_forget_gate2.configure(lstm_params.cell_to_forget_weights(), &_forget_gate_out4);
+        _memory_group.manage(&_forget_gate_out5);
+        _gemm_forget_gate2.configure(cell_state, &_forget_gate_out4, nullptr, &_forget_gate_out5, 1.f, 0.f);
+        _forget_gate_out4.allocator()->allocate();
+        _accum_forget_gate2.configure(&_forget_gate_out6, &_forget_gate_out5, &_forget_gate_out3, ConvertPolicy::SATURATE);
+        _forget_gate_out5.allocator()->allocate();
+        _forget_gate_out6.allocator()->allocate();
+        forget_gate_out = &_forget_gate_out3;
+    }
+    else
+    {
+        _forget_gate_out3.allocator()->allocate();
+    }
+    _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    forget_gate_out->allocator()->allocate();
+
+    TensorShape input_gate3_shape{ 1, output_state->info()->dimension(1) };
+    _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _input_gate_out5.allocator()->init(TensorInfo(input_gate3_shape, 1, input->info()->data_type()));
+
+    // Configure block that calculates the input gate
+    // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + cell_state * cell_to_input_weights + input_gate_bias), without CIFG
+    // input_gate = 1 - forget_gate, with CIFG
+    if(lstm_params.has_cifg_opt())
+    {
+        _memory_group.manage(&_input_gate_out1);
+        _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+        _ones.allocator()->allocate();
+        _run_cifg_opt = true;
+    }
+    else
+    {
+        TensorShape input_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+        TensorShape input_gate2_shape = compute_transposed_shape(*lstm_params.cell_to_input_weights()->info());
+
+        _input_gate_out2.allocator()->init(TensorInfo(input_gate1_shape, 1, input->info()->data_type()));
+        _input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _input_gate_out4.allocator()->init(TensorInfo(input_gate2_shape, 1, input->info()->data_type()));
+        _input_gate_out6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+        _memory_group.manage(&_input_gate_out1);
+        _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1, true, false);
+        _memory_group.manage(&_input_gate_out2);
+        _transpose_input_gate1.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
+        _memory_group.manage(&_input_gate_out3);
+        _gemm_input_gate1.configure(output_state, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
+        _input_gate_out2.allocator()->allocate();
+        _memory_group.manage(&_input_gate_out4);
+        _transpose_input_gate2.configure(lstm_params.cell_to_input_weights(), &_input_gate_out4);
+        _memory_group.manage(&_input_gate_out5);
+        _gemm_input_gate2.configure(cell_state, &_input_gate_out4, nullptr, &_input_gate_out5, 1.f, 0.f);
+        _input_gate_out4.allocator()->allocate();
+        _memory_group.manage(&_input_gate_out6);
+        _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out6, ConvertPolicy::SATURATE);
+        _input_gate_out3.allocator()->allocate();
+        _accum_input_gate2.configure(&_input_gate_out6, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+        _input_gate_out5.allocator()->allocate();
+        _input_gate_out6.allocator()->allocate();
+        _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    }
+
+    TensorShape cell_state1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    _cell_state_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _cell_state_out2.allocator()->init(TensorInfo(cell_state1_shape, 1, input->info()->data_type()));
+    _cell_state_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _cell_state_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+    // Configure block that calculates the cell state
+    // cell_state = Clip((RixelwiseMul(input_gate, Activation(input * input_to_cell_weights + output_state * recurrent_to_cell_weights + cell_bias)) + PixelwiseMul(forget_gate, cell_state)), cell_threshold)
+    _memory_group.manage(&_cell_state_out1);
+    _fully_connected_cell_state.configure(input, input_to_cell_weights, cell_bias, &_cell_state_out1, true, false);
+    _memory_group.manage(&_cell_state_out2);
+    _transpose_cell_state1.configure(recurrent_to_cell_weights, &_cell_state_out2);
+    _memory_group.manage(&_cell_state_out3);
+    _gemm_cell_state1.configure(output_state, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
+    _cell_state_out2.allocator()->allocate();
+    _memory_group.manage(&_cell_state_out4);
+    _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+    _activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
+    _memory_group.manage(&_cell_state_out5);
+    _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _input_gate_out1.allocator()->allocate();
+    _cell_state_out4.allocator()->allocate();
+    _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _forget_gate_out1.allocator()->allocate();
+    _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+    _cell_state_out3.allocator()->allocate();
+    _cell_state_out5.allocator()->allocate();
+
+    // Perform clipping
+    if(cell_threshold != 0.f)
+    {
+        _perform_cell_clipping = true;
+        _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold));
+    }
+
+    TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    TensorShape output2_shape = compute_transposed_shape(*cell_bias->info());
+    TensorShape output3_shape{ 1, output_state->info()->dimension(1) };
+    _output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
+    _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _output6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+    // Configure block that calculates the output
+    // output_gate = Activation(input * input_to_output_weights + output_state * recurrent_to_output_weights + cell_state * cell_to_output_weights + output_gate_bias)
+    _memory_group.manage(&_output1);
+    _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1, true, false);
+    _memory_group.manage(&_output2);
+    _transpose_output1.configure(recurrent_to_output_weights, &_output2);
+    _memory_group.manage(&_output3);
+    _gemm_output1.configure(output_state, &_output2, nullptr, &_output3, 1.f, 0.f);
+    _output2.allocator()->allocate();
+    _memory_group.manage(&_output6);
+    _accum_output1.configure(&_output1, &_output3, &_output6, ConvertPolicy::SATURATE);
+    _output3.allocator()->allocate();
+    CLTensor *output_gate_out = &_output6;
+    if(lstm_params.has_peephole_opt())
+    {
+        _output4.allocator()->init(TensorInfo(output2_shape, 1, input->info()->data_type()));
+        _output5.allocator()->init(TensorInfo(output3_shape, 1, input->info()->data_type()));
+
+        _memory_group.manage(&_output4);
+        _transpose_output2.configure(lstm_params.cell_to_output_weights(), &_output4);
+        _memory_group.manage(&_output5);
+        _gemm_output2.configure(&_cell_state_out1, &_output4, nullptr, &_output5, 1.f, 0.f);
+        _accum_output2.configure(&_output6, &_output5, &_output1, ConvertPolicy::SATURATE);
+        _output6.allocator()->allocate();
+        output_gate_out = &_output1;
+
+        // Allocate intermediate buffers
+        _output4.allocator()->allocate();
+        _output5.allocator()->allocate();
+    }
+    else
+    {
+        _output1.allocator()->allocate();
+    }
+    _activation_output.configure(output_gate_out, output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    output_gate_out->allocator()->allocate();
+
+    _cell_state_activation.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+    // Configure block that calculates the output state
+    /** lstm_res = PixelwiseMul(output, Activation(cell_state))
+     *
+     *                      -- Clip(lstm_res * projection_weights + projection_bias, projection_threshold) , if there is a projection
+     *                     /
+     *  output_state =  --
+     *                     \
+     *                      -- lstm_res , otherwise
+     */
+    _memory_group.manage(&_cell_state_activation);
+    _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
+    _pixelwise_mul_output_state.configure(&_cell_state_activation, output, output_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _cell_state_activation.allocator()->allocate();
+
+    if(lstm_params.has_projection())
+    {
+        _has_projection_weights = true;
+        _output_projection1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _memory_group.manage(&_output_projection1);
+        _fully_connected_output_state.configure(output_state, lstm_params.projection_weights(), lstm_params.projection_bias(), &_output_projection1, true, false);
+        // Perform clipping
+        if(projection_threshold != 0.f)
+        {
+            _perform_projection_clipping = true;
+            _projection_clip.configure(&_output_projection1, output_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+        }
+
+        // Allocate intermediate buffer
+        _output_projection1.allocator()->allocate();
+    }
+
+    // Copy cell state and output
+    _copy_cell_state.configure(&_cell_state_out1, cell_state);
+    _cell_state_out1.allocator()->allocate();
+    _copy_output.configure(output_state, output);
+
+    // Vector for holding the tensors to store in scratch buffer
+    std::vector<ICLTensor *> scratch_inputs;
+    if(lstm_params.has_cifg_opt())
+    {
+        scratch_inputs.emplace_back(&_input_gate_out1);
+    }
+    scratch_inputs.emplace_back(&_cell_state_out1);
+    scratch_inputs.emplace_back(forget_gate_out);
+    scratch_inputs.emplace_back(output_gate_out);
+    _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+}
+
+Status CLLSTMLayer::validate(const ITensorInfo *input, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
+                             const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
+                             const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
+                             const ITensorInfo *output_state, const ITensorInfo *cell_state, const ITensorInfo *scratch_buffer, const ITensorInfo *output,
+                             const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+                                        forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+                                                       recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_to_forget_weights->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_to_cell_weights->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_forget_weights->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_cell_weights->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->num_dimensions() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_gate_bias->num_dimensions() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_state->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_state->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(scratch_buffer->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+
+    if(lstm_params.has_peephole_opt())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights(), lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() != 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_output_weights()->num_dimensions() != 1);
+    }
+
+    TensorShape      units_out_transposed_shape = compute_transposed_shape(*recurrent_to_output_weights);
+    TensorShape      gemmv_shape{ 1, output_state->dimension(1) };
+    TensorShape      num_units_transposed_shape = compute_transposed_shape(*forget_gate_bias);
+    const TensorInfo units_out_transposed_info  = TensorInfo(units_out_transposed_shape, 1, input->data_type());
+    const TensorInfo gemmv_shape_info           = TensorInfo(gemmv_shape, 1, input->data_type());
+    const TensorInfo num_units_transposed_info  = TensorInfo(num_units_transposed_shape, 1, input->data_type());
+
+    // Validate forget gate
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, cell_state, true, false));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state, &units_out_transposed_info, nullptr, cell_state, 1.f, 0.f, GEMMInfo()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+    if(lstm_params.has_peephole_opt())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(cell_state, &num_units_transposed_info, nullptr, &gemmv_shape_info, 1.f, 0.f, GEMMInfo()));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, &gemmv_shape_info, cell_state, ConvertPolicy::SATURATE));
+    }
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, cell_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+    // Validate input gate
+    if(!lstm_params.has_cifg_opt())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.cell_to_input_weights(), lstm_params.input_gate_bias());
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() != 2);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() != 2);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() != 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() != 1);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), cell_state, true, false));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(cell_state, &num_units_transposed_info, nullptr, &gemmv_shape_info, 1.f, 0.f, GEMMInfo()));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, &gemmv_shape_info, cell_state, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtractionKernel::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+    }
+
+    // Validate cell state
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, cell_bias, cell_state, true, false));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, activation_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state, cell_state, cell_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+
+    if(cell_threshold != 0.f)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)));
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, cell_state, true, false));
+    if(lstm_params.has_peephole_opt())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+    }
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+    // Validate output state
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, cell_state, activation_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state, output, output_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+    if(lstm_params.has_projection())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(output_state, lstm_params.projection_weights(), lstm_params.projection_bias(), cell_state, true, false));
+        if(projection_threshold != 0.f)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, output_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold,
+                                                                                                                        projection_threshold)));
+        }
+    }
+
+    std::vector<TensorInfo> inputs_vector_info;
+    if(lstm_params.has_cifg_opt())
+    {
+        inputs_vector_info.emplace_back(*cell_state);
+    }
+    inputs_vector_info.emplace_back(*cell_state);
+    inputs_vector_info.emplace_back(*cell_state);
+    inputs_vector_info.emplace_back(*cell_state);
+
+    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    for(auto &input : inputs_vector_info)
+    {
+        inputs_vector_info_raw.emplace_back(&input);
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer));
+    return Status{};
+}
+
+void CLLSTMLayer::run()
+{
+    _memory_group.acquire();
+
+    _fully_connected_forget_gate.run();
+    CLScheduler::get().enqueue(_transpose_forget_gate1);
+    _gemm_forget_gate1.run();
+    CLScheduler::get().enqueue(_accum_forget_gate1);
+
+    if(_run_peephole_opt)
+    {
+        CLScheduler::get().enqueue(_transpose_forget_gate2);
+        _gemm_forget_gate2.run();
+        _accum_forget_gate2.run();
+    }
+    CLScheduler::get().enqueue(_activation_forget_gate);
+
+    if(_run_cifg_opt)
+    {
+        _ones.map(true);
+        std::fill_n(_ones.buffer(), _ones.info()->total_size(), 1);
+        _ones.unmap();
+        CLScheduler::get().enqueue(_subtract_input_gate);
+    }
+    else
+    {
+        _fully_connected_input_gate.run();
+        CLScheduler::get().enqueue(_transpose_input_gate1);
+        _gemm_input_gate1.run();
+        CLScheduler::get().enqueue(_transpose_input_gate2);
+        _gemm_input_gate2.run();
+        CLScheduler::get().enqueue(_accum_input_gate1);
+        _accum_input_gate2.run();
+        CLScheduler::get().enqueue(_activation_input_gate);
+    }
+
+    _fully_connected_cell_state.run();
+    CLScheduler::get().enqueue(_transpose_cell_state1);
+    _gemm_cell_state1.run();
+    CLScheduler::get().enqueue(_accum_cell_state1);
+    CLScheduler::get().enqueue(_activation_cell_state);
+    CLScheduler::get().enqueue(_pixelwise_mul_cell_state1);
+    CLScheduler::get().enqueue(_pixelwise_mul_cell_state2);
+    CLScheduler::get().enqueue(_accum_cell_state2);
+
+    if(_perform_cell_clipping)
+    {
+        CLScheduler::get().enqueue(_cell_clip);
+    }
+
+    _fully_connected_output.run();
+    CLScheduler::get().enqueue(_transpose_output1);
+    _gemm_output1.run();
+    CLScheduler::get().enqueue(_accum_output1);
+    CLScheduler::get().enqueue(_pixelwise_mul_output_state);
+
+    if(_run_peephole_opt)
+    {
+        CLScheduler::get().enqueue(_transpose_output2);
+        _gemm_output2.run();
+        _accum_output2.run();
+    }
+    CLScheduler::get().enqueue(_activation_output);
+
+    CLScheduler::get().enqueue(_activation_output_state);
+    CLScheduler::get().enqueue(_pixelwise_mul_output_state);
+
+    if(_has_projection_weights)
+    {
+        _fully_connected_output_state.run();
+        if(_perform_projection_clipping)
+        {
+            CLScheduler::get().enqueue(_projection_clip);
+        }
+    }
+
+    CLScheduler::get().enqueue(_copy_cell_state);
+    CLScheduler::get().enqueue(_copy_output);
+
+    _concat_scratch_buffer.run();
+
+    _memory_group.release();
+}
\ No newline at end of file

diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 9120aad..986fe00 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp

@@ -33,72 +33,120 @@
 
 using namespace arm_compute;
 
-CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
-      _is_first_run(false)
+namespace
 {
-}
-
-void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                      TensorShape &shape_wr, TensorShape &shape_im2col, TensorShape &shape_gemm)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(!conv_info.padding_is_symmetric());
+    ARM_COMPUTE_UNUSED(output);
 
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
-    }
+    const unsigned int kernel_width  = weights->dimension(0);
+    const unsigned int kernel_height = weights->dimension(1);
 
-    bool _has_bias = (biases != nullptr);
-    _is_first_run  = true;
-
-    // Get parameters for conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    unsigned int pad_x    = 0;
-    unsigned int pad_y    = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-    std::tie(pad_x, pad_y)       = conv_info.pad();
+    bool has_bias = (biases != nullptr);
 
     // Get convolved dimensions
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
                                                  conv_info);
 
-    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
-    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+    const size_t mat_weights_cols = weights->dimension(3);
+    const size_t mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + ((has_bias) ? 1 : 0);
+    const size_t mat_weights_num  = weights->dimension(4);
 
-    // Create tensor to store the reshaped weights
-    const size_t mat_weights_cols = weights->info()->dimension(3);
-    const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
-    const size_t mat_weights_num  = weights->info()->dimension(4);
+    shape_wr = TensorShape(mat_weights_cols, mat_weights_rows, mat_weights_num);
 
-    const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
-
-    _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
-
-    // Create tensor to store im2col reshaped inputs
     const size_t mat_input_cols = mat_weights_rows;
     const size_t mat_input_rows = conv_w * conv_h;
-    TensorShape  shape_im2col   = input->info()->tensor_shape();
+
+    shape_im2col = input->tensor_shape();
     shape_im2col.set(0, mat_input_cols);
     shape_im2col.set(1, mat_input_rows);
     shape_im2col.set(2, 1);
 
-    _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
-
-    // Create locally connected layer output tensor
-    TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+    shape_gemm = shape_im2col;
     shape_gemm.set(0, mat_weights_cols);
     shape_gemm.set(1, mat_input_rows);
+}
+} // namespace
+
+CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+      _is_first_run(false), _original_weights(nullptr)
+{
+}
+
+Status CLLocallyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON(!conv_info.padding_is_symmetric());
+
+    bool has_bias = (biases != nullptr);
+
+    if(has_bias)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 2);
+    }
+
+    const unsigned int kernel_width  = weights->dimension(0);
+    const unsigned int kernel_height = weights->dimension(1);
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+                                                 conv_info);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+    // Calculate intermediate buffer shapes
+    TensorShape shape_wr;
+    TensorShape shape_im2col;
+    TensorShape shape_gemm;
+    calculate_shapes(input, weights, biases, output, conv_info, shape_wr, shape_im2col, shape_gemm);
+
+    TensorInfo weights_reshaped_info(shape_wr, 1, weights->data_type());
+    TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type());
+    TensorInfo gemm_output_info(shape_gemm, 1, input->data_type());
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLLocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(&gemm_output_info, output, std::make_pair(conv_w, conv_h)));
+
+    return Status{};
+}
+
+void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLLocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
+
+    bool _has_bias    = (biases != nullptr);
+    _original_weights = weights;
+    _is_first_run     = true;
+
+    const unsigned int kernel_width  = weights->info()->dimension(0);
+    const unsigned int kernel_height = weights->info()->dimension(1);
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+                                                 conv_info);
+
+    // Calculate intermediate buffer shapes
+    TensorShape shape_wr;
+    TensorShape shape_im2col;
+    TensorShape shape_gemm;
+    calculate_shapes(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info, shape_wr, shape_im2col, shape_gemm);
+
+    _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
+    _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
     _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
 
     // Manage intermediate buffers
@@ -106,7 +154,7 @@
     _memory_group.manage(&_gemm_output);
 
     // Configure kernels
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(conv_w, conv_h), conv_info, _has_bias);
+    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
     _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
     _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
     _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
@@ -122,8 +170,13 @@
     // Run weights reshaping (Runs once for every configure)
     if(_is_first_run)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _is_first_run = false;
         CLScheduler::get().enqueue(_weights_reshape_kernel);
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();

diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index 146856c..55b7649 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp

@@ -39,6 +39,6 @@
 
 Status CLPermute::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON(CLPermuteKernel::validate(input, output, perm));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(input, output, perm));
     return Status{};
-}
\ No newline at end of file
+}

diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 201bf87..17875a3 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp

@@ -41,13 +41,28 @@
     _kernel = std::move(k);
 
     // Configure border depending on operation required (quantize border in case of asymmetric data_type)
-    BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-    PixelValue zero_value(0.f);
+    BorderMode border_mode{};
+    PixelValue pixel_value(0.f);
     if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
     {
-        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+        pixel_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
     }
-    _border_handler.configure(input, _kernel->border_size(), border_mode, zero_value);
+    switch(input->info()->data_layout())
+    {
+        case DataLayout::NCHW:
+            border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+            break;
+        case DataLayout::NHWC:
+            border_mode = BorderMode::CONSTANT;
+            if(PoolingType::MAX == pool_info.pool_type() && !is_data_type_quantized_asymmetric(input->info()->data_type()))
+            {
+                pixel_value = PixelValue(std::numeric_limits<float>::lowest());
+            }
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data layout not supported");
+    }
+    _border_handler.configure(input, _kernel->border_size(), border_mode, pixel_value);
 }
 
 Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)

diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index ed1f51c..a13859c 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 
 #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
 
+#include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 using namespace arm_compute;
@@ -33,8 +34,21 @@
 {
 }
 
+Status CLQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    TensorInfo min_max{ input->num_channels(), input->data_type() };
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMinMaxLayerKernel::validate(input, &min_max));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayerKernel::validate(input, output, &min_max));
+
+    return Status{};
+}
+
 void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
     // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel.
     _min_max_kernel.configure(input, &_min_max);
 

diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
new file mode 100644
index 0000000..4843ba6
--- /dev/null
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp

@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRNNLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output()
+{
+}
+
+Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
+                            const ITensorInfo *output, const ActivationLayerInfo &info)
+{
+    const int idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const int idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != recurrent_weights->dimension(idx_width));
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != recurrent_weights->dimension(1));
+    ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+    ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+    ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
+
+    auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info, true, false));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
+
+    return Status{};
+}
+
+void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output,
+                           ActivationLayerInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+
+    const int   idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    TensorShape shape      = compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+
+    _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+    _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+
+    // Manage intermediate buffers and configure
+    _memory_group.manage(&_fully_connected_out);
+    _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out, true, false);
+
+    _memory_group.manage(&_gemm_output);
+    _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+
+    _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+    _memory_group.manage(&_add_output);
+
+    _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+
+    _fully_connected_out.allocator()->allocate();
+    _gemm_output.allocator()->allocate();
+
+    _activation_kernel.configure(&_add_output, hidden_state, info);
+    _add_output.allocator()->allocate();
+
+    _copy_kernel.configure(hidden_state, output);
+}
+
+void CLRNNLayer::run()
+{
+    _memory_group.acquire();
+    _fully_connected_kernel.run();
+    _gemm_state_f.run();
+    CLScheduler::get().enqueue(_add_kernel);
+    CLScheduler::get().enqueue(_activation_kernel);
+
+    // copy hidden out to output
+    CLScheduler::get().enqueue(_copy_kernel);
+    _memory_group.release();
+}
\ No newline at end of file

diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index d02afb4..3a5133d 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,19 +35,64 @@
 
 using namespace arm_compute;
 
+namespace
+{
+unsigned int calculate_number_of_stages(const ITensorInfo *input)
+{
+    // Calculate number of WGs. 16 elements per thread, 8 threads per WG
+    const unsigned int num_of_wg = ceil(input->dimension(0) / 128.f);
+
+    // Calculate number of stages. First stage performs op and the rest reduction sum
+    // depending on the size of the input. Last stage should have only 1 WG.
+    const unsigned int num_of_stages = num_of_wg / 128 + 2;
+
+    return num_of_stages;
+}
+} // namespace
+
 CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
 {
 }
 
+Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+    const unsigned int num_of_stages = calculate_number_of_stages(input);
+
+    // Create temporary tensor infos
+    auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
+
+    // Create intermediate tensor info
+    TensorShape shape{ input->tensor_shape() };
+
+    for(unsigned int i = 0; i < num_of_stages - 1; i++)
+    {
+        shape.set(0, ceil(shape.x() / 128.f));
+        sums_vector[i].set_data_type(input->data_type());
+        sums_vector[i].set_tensor_shape(shape);
+        sums_vector[i].set_num_channels(input->num_channels());
+        sums_vector[i].set_fixed_point_position(input->fixed_point_position());
+    }
+
+    // Validate ReductionOperation only on first kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, op));
+
+    // Validate ReductionOperation on intermediate stages
+    for(unsigned int i = 1; i < num_of_stages - 1; ++i)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, op));
+    }
+
+    // Validate ReductionOperation on the last stage
+    const unsigned int last_stage = num_of_stages - 1;
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, op));
+
+    return Status{};
+}
+
 void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
 {
-    // Calculate number of WGs. 16 elements per thread, 8 threads per WG
-    unsigned int num_of_wg = ceil(input->info()->dimension(0) / 128.f);
-
-    // Calculate number of stages. First stage performs op and the rest reduction sum
-    // depending on the size of the input. Last stage should have only 1 WG.
-    _num_of_stages = num_of_wg / 128 + 2;
+    _num_of_stages = calculate_number_of_stages(input->info());
 
     // Create temporary tensors
     _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
@@ -95,4 +140,4 @@
     }
 
     _memory_group.release();
-}
\ No newline at end of file
+}

diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
new file mode 100644
index 0000000..d542781
--- /dev/null
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp

@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLWidthConcatenateLayer::CLWidthConcatenateLayer() // NOLINT
+    : _concat_kernels_vector(),
+      _num_inputs(0)
+{
+}
+
+Status CLWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output) // NOLINT
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
+
+    // Output auto inizialitation if not yet initialized
+    TensorInfo  tmp_output_info = *output->clone();
+    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type(), inputs_vector[0]->fixed_point_position());
+
+    unsigned int width_offset = 0;
+    for(const auto &input : inputs_vector)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
+        width_offset += input->dimension(0);
+    }
+
+    return Status{};
+}
+
+void CLWidthConcatenateLayer::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
+{
+    _num_inputs = inputs_vector.size();
+
+    std::vector<ITensorInfo *> inputs_vector_info;
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+    }
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+    ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
+
+    unsigned int width_offset = 0;
+
+    _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
+
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
+        width_offset += inputs_vector.at(i)->info()->dimension(0);
+    }
+}
+
+void CLWidthConcatenateLayer::run()
+{
+    cl::CommandQueue q = CLScheduler::get().queue();
+
+    for(unsigned i = 0; i < _num_inputs; i++)
+    {
+        CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+    }
+}

diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
new file mode 100644
index 0000000..49753ad
--- /dev/null
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp

@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims)
+{
+    Size2D output_tile = Size2D{};
+
+    if(kernel_dims == Size2D(3U, 3U))
+    {
+        output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+    }
+    else if(kernel_dims == Size2D(5U, 5U))
+    {
+        output_tile = Size2D(4U, 4U);
+    }
+
+    return output_tile;
+}
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
+{
+    // Check if we want to configure a Winograd configuration which requires fast math
+    using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+    std::vector<WinogradConfiguration> fast_math_winograd =
+    {
+        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+    };
+
+    auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+                            std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+    return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
+}
+} // namespace
+
+CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
+      _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
+{
+}
+
+void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
+                                           bool enable_fast_math)
+{
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+    const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+    // Check if the Winograd configuration requires fast math
+    if(!enable_fast_math)
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+    }
+
+    const WinogradInfo winograd_info = WinogradInfo(output_tile,
+                                                    kernel_size,
+                                                    input_dims,
+                                                    conv_info,
+                                                    input->info()->data_layout());
+
+    _is_prepared      = false;
+    _original_weights = weights;
+
+    // Manage intermediate tensors
+    _memory_group.manage(&_input0);
+    _memory_group.manage(&_batched_mm_output);
+
+    // Do not manage _input1 as it contains the weights
+
+    // Configure input transform
+    _input_transform.configure(input, &_input0, winograd_info);
+
+    // Configure filter transform
+    _filter_transform.configure(weights, &_input1, winograd_info);
+
+    // Configure batched matrix multiply
+    _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+
+    // Configure output transform
+    _output_transform.configure(&_batched_mm_output, biases, output, winograd_info);
+
+    // Configure activation layer
+    _is_activationlayer_enabled = act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
+
+    // Allocate temporary tensors
+    _input0.allocator()->allocate();
+    _batched_mm_output.allocator()->allocate();
+}
+
+Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                            const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+    // Get indeces for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(input->tensor_shape()[idx_width], input->tensor_shape()[idx_height]);
+    const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+    // Check if the Winograd configuration requires fast math
+    if(!enable_fast_math)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+    }
+
+    const WinogradInfo winograd_info = WinogradInfo(output_tile,
+                                                    kernel_size,
+                                                    input_dims,
+                                                    conv_info,
+                                                    input->data_layout());
+
+    // Validate input transform
+    const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+    const TensorInfo  input0       = input->clone()->set_tensor_shape(input0_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransform::validate(input, &input0, winograd_info));
+
+    // Validate filter transform
+    const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+    const TensorInfo  input1       = weights->clone()->set_tensor_shape(input1_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradFilterTransformKernel::validate(weights, &input1, winograd_info));
+
+    // Validate batched matrix multiply
+    TensorShape batched_mm_output_shape = input0.tensor_shape();
+    batched_mm_output_shape[0]          = input1.tensor_shape()[0];
+    const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/)));
+
+    // Configure output transform
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, winograd_info));
+
+    // Validate Activation Layer
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+    }
+
+    return Status{};
+}
+
+void CLWinogradConvolutionLayer::run()
+{
+    prepare();
+
+    _memory_group.acquire();
+
+    // Run input transform
+    _input_transform.run();
+
+    // Run batched matrix multiplication
+    _batched_mm.run();
+
+    // Run output transform
+    CLScheduler::get().enqueue(_output_transform);
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
+
+    _memory_group.release();
+}
+
+void CLWinogradConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run filter transform and mark original weights as unused
+        _input1.allocator()->allocate();
+        CLScheduler::get().enqueue(_filter_transform, false);
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM and release reshaped weights if marked unused by CLGEMM
+        _batched_mm.prepare();
+        if(!_input1.is_used())
+        {
+            _input1.allocator()->free();
+        }
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
+}

diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
new file mode 100644
index 0000000..09e8456
--- /dev/null
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp

@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
+#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLWinogradInputTransform::configure(ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>();
+    k->configure(input, output, winograd_info);
+    _kernel = std::move(k);
+    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransformKernel::validate(input, output, winograd_info));
+    return Status{};
+}

diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp
new file mode 100644
index 0000000..c0ebd24
--- /dev/null
+++ b/src/runtime/CL/tuners/BifrostTuner.cpp

@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/tuners/BifrostTuner.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernels.h"
+#include "arm_compute/core/utils/misc/Cast.h"
+
+namespace arm_compute
+{
+namespace tuners
+{
+namespace
+{
+/** Tunes a @ref CLDirectConvolutionLayerKernel for a bifrost target
+ *
+ * @param[in] k Kernels to tune
+ */
+void tune_direct_convolution_kernel(CLDirectConvolutionLayerKernel &k)
+{
+    cl::NDRange lws_hint = k.lws_hint();
+
+    const GPUTarget    gpu_target    = k.get_target();
+    const DataType     dt            = k._input->info()->data_type();
+    const TensorShape  weights_shape = k._weights->info()->tensor_shape();
+    const TensorShape  inputs_shape  = k._input->info()->tensor_shape();
+    const size_t       kernel_size   = weights_shape.x();
+    const unsigned int stride_x      = k._conv_stride_x;
+    const unsigned int stride_y      = k._conv_stride_y;
+
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (kernel_size <= 5) && (stride_x == 1) && (stride_y == 1) && (dt == DataType::F32))
+    {
+        // Through extensive experimentation with over 30 representative tensor
+        // shapes, we found a small number of local work size configurations
+        // that result in nearly optimal execution times. Selecting the right
+        // lws for a given shape, however, required a complex decision tree,
+        // until we constructed a simple feature as described below.
+        //
+        // We started from the number of multiply-accumulate operations for a
+        // convolution layer, which is equal to the product of the input
+        // dimensions 0..2 and the weights dimensions 0..2.  Unfortunately,
+        // this resulted in ties between distinct shapes that required distinct
+        // lws configurations. Replacing the width of the input with the kernel
+        // size, however, resulted in nearly optimal predictions. We use underscores
+        // in variable names to indicate when they are intentionally misleading.
+        const size_t product_of_weights_dimensions = weights_shape[0] * weights_shape[1] * weights_shape[2];
+        const size_t product_of_input_dimensions_  = inputs_shape[0] * inputs_shape[1] * inputs_shape[2];
+        const float  mega_ops_                     = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;
+
+        switch(kernel_size)
+        {
+            case 1:
+            {
+                if(mega_ops_ < 1.f)
+                {
+                    lws_hint = cl::NDRange(1, 1, 8);
+                }
+                else if(mega_ops_ < 7.f)
+                {
+                    lws_hint = cl::NDRange(1, 1, 4);
+                }
+                else
+                {
+                    lws_hint = cl::NDRange(1, 1, 2);
+                }
+                break;
+            }
+            case 3:
+            {
+                if(mega_ops_ < 1.f)
+                {
+                    lws_hint = cl::NDRange(1, 1, 8);
+                }
+                else if(mega_ops_ < 13.f)
+                {
+                    lws_hint = cl::NDRange(2, 1, 4);
+                }
+                else if(mega_ops_ < 50.f)
+                {
+                    lws_hint = cl::NDRange(3, 1, 4);
+                }
+                else
+                {
+                    lws_hint = cl::NDRange(2, 1, 6);
+                }
+                break;
+            }
+            case 5:
+            {
+                if(mega_ops_ < 2.f || mega_ops_ > 80.f)
+                {
+                    lws_hint = cl::NDRange(2, 1, 4);
+                }
+                else
+                {
+                    lws_hint = cl::NDRange(2, 1, 8);
+                }
+                break;
+            }
+            default:
+                break;
+        }
+        k.set_lws_hint(lws_hint);
+    }
+}
+} // namespace
+
+void BifrostTuner::tune_kernel_static(ICLKernel &kernel)
+{
+    // Continue on tuning if dynamic tuning
+    if(dynamic_cast<CLDirectConvolutionLayerKernel *>(&kernel) != nullptr)
+    {
+        tune_direct_convolution_kernel(*utils::cast::polymorphic_downcast<CLDirectConvolutionLayerKernel *>(&kernel));
+    }
+}
+
+void BifrostTuner::tune_kernel_dynamic(ICLKernel &kernel)
+{
+    ARM_COMPUTE_UNUSED(kernel);
+}
+} // namespace tuners
+} // namespace arm_compute
\ No newline at end of file
commit	b3a371bc429d2ba45e56baaf239d8200c2662a74	[log] [tgz]
author	Jenkins <bsgcomp@arm.com>	Wed May 23 11:36:53 2018 +0100
committer	Anthony Barbier <anthony.barbier@arm.com>	Wed May 23 14:55:11 2018 +0100
tree	554525e415c303d64a08722a755397852ebbb8e4
parent	67c8c91522e5be8156b77f57e63c0253535c902a [diff]