arm_compute v17.06
diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp
index 3c32e2b..b75ebcf 100644
--- a/src/core/AccessWindowAutoPadding.cpp
+++ b/src/core/AccessWindowAutoPadding.cpp
@@ -23,12 +23,12 @@
  */
 #include "arm_compute/core/AccessWindowAutoPadding.h"
 
-#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Window.h"
 
 using namespace arm_compute;
 
-AccessWindowAutoPadding::AccessWindowAutoPadding(TensorInfo *info)
+AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info)
     : _info(info)
 {
 }
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
index d3eb666..8b6419c 100644
--- a/src/core/AccessWindowStatic.cpp
+++ b/src/core/AccessWindowStatic.cpp
@@ -24,12 +24,12 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Window.h"
 
 using namespace arm_compute;
 
-AccessWindowStatic::AccessWindowStatic(TensorInfo *info, int start_x, int start_y, int end_x, int end_y)
+AccessWindowStatic::AccessWindowStatic(ITensorInfo *info, int start_x, int start_y, int end_x, int end_y)
     : _info(info), _start_x(start_x), _start_y(start_y), _end_x(end_x), _end_y(end_y)
 {
 }
@@ -39,7 +39,7 @@
     ARM_COMPUTE_UNUSED(border_undefined);
     ARM_COMPUTE_UNUSED(border_size);
 
-    return compute_valid_region(window, std::move(input_valid_region));
+    return compute_valid_region(window, input_valid_region);
 }
 
 ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region) const
@@ -55,12 +55,18 @@
     // Start of the valid region is equal to the start of the static access but
     // never outside of the tensor.
     anchor.set(0, std::max<int>(0, _start_x));
-    anchor.set(1, std::max<int>(0, _start_y));
+    if(_info->num_dimensions() > 1)
+    {
+        anchor.set(1, std::max<int>(0, _start_y));
+    }
 
     // End of the valid region is equal to the end of the static access but
     // never outside of the tensor.
     shape.set(0, std::min<int>(_end_x, _info->tensor_shape()[0]));
-    shape.set(1, std::min<int>(_end_y, _info->tensor_shape()[1]));
+    if(_info->num_dimensions() > 1)
+    {
+        shape.set(1, std::min<int>(_end_y, _info->tensor_shape()[1]));
+    }
 
     // For higher dimension use the intersection of the window size and the
     // valid region of the input
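Note: the added checks stop AccessWindowStatic from touching dimension 1 of the valid region when the tensor is 1D; the clamping itself is unchanged. A simplified C++ sketch of the clamping compute_valid_region() applies per dimension (not the full ValidRegion bookkeeping):

    #include <algorithm>

    // Sketch only: clamp a static access [start, end) to the tensor extent.
    void clamp_static_access(int start, int end, int tensor_dim, int &anchor, int &clamped_end)
    {
        anchor      = std::max(0, start);        // never start before the tensor
        clamped_end = std::min(end, tensor_dim); // never end past the tensor
    }
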
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index 45d4062..b3605c4 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/core/AccessWindowTranspose.h"
 
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Window.h"
 
 using namespace arm_compute;
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 76821c6..21b72dd 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -22,10 +22,48 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/CL/CLHelpers.h"
-
+#include "arm_compute/core/CL/CLTypes.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
 
+#include <map>
+#include <vector>
+
+namespace
+{
+arm_compute::GPUTarget get_bifrost_target(const std::string &name)
+{
+    arm_compute::GPUTarget target = arm_compute::GPUTarget::MIDGARD;
+
+    if(name == "G7")
+    {
+        target = arm_compute::GPUTarget::G70;
+    }
+
+    return target;
+}
+
+arm_compute::GPUTarget get_midgard_target(const std::string &name)
+{
+    arm_compute::GPUTarget target = arm_compute::GPUTarget::MIDGARD;
+
+    if(name == "T6")
+    {
+        target = arm_compute::GPUTarget::T600;
+    }
+    else if(name == "T7")
+    {
+        target = arm_compute::GPUTarget::T700;
+    }
+    else if(name == "T8")
+    {
+        target = arm_compute::GPUTarget::T800;
+    }
+
+    return target;
+}
+} // namespace
+
 namespace arm_compute
 {
 std::string get_cl_type_from_data_type(const DataType &dt)
@@ -57,4 +95,71 @@
             return "";
     }
 }
+
+const std::string &string_from_target(GPUTarget target)
+{
+    static std::map<GPUTarget, const std::string> gpu_target_map =
+    {
+        { GPUTarget::MIDGARD, "midgard" },
+        { GPUTarget::BIFROST, "bifrost" },
+        { GPUTarget::T600, "t600" },
+        { GPUTarget::T700, "t700" },
+        { GPUTarget::T800, "t800" },
+        { GPUTarget::G70, "g70" }
+    };
+
+    return gpu_target_map[target];
+}
+
+GPUTarget get_target_from_device(cl::Device &device)
+{
+    const std::string name_mali("Mali-");
+    GPUTarget         target{ GPUTarget::MIDGARD };
+
+    size_t            name_size = 0;
+    std::vector<char> name;
+
+    // Query device name size
+    cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, 0, nullptr, &name_size);
+    ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (name_size == 0), "clGetDeviceInfo failed to return valid information");
+    // Resize vector
+    name.resize(name_size);
+    // Query device name
+    err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, name_size, name.data(), nullptr);
+    ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
+    ARM_COMPUTE_UNUSED(err);
+
+    std::string name_str(name.begin(), name.end());
+    auto        pos = name_str.find(name_mali);
+
+    if(pos != std::string::npos)
+    {
+        ARM_COMPUTE_ERROR_ON_MSG((pos + name_mali.size() + 2) > name_str.size(), "Device name is shorter than expected.");
+        std::string sub_name = name_str.substr(pos + name_mali.size(), 2);
+
+        if(sub_name[0] == 'G')
+        {
+            target = get_bifrost_target(sub_name);
+        }
+        else if(sub_name[0] == 'T')
+        {
+            target = get_midgard_target(sub_name);
+        }
+        else
+        {
+            ARM_COMPUTE_INFO("Mali GPU unknown. Target is set to the default one.");
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_INFO("Can't find valid Mali GPU. Target is set to the default one.");
+    }
+
+    return target;
+}
+
+GPUTarget get_arch_from_target(GPUTarget target)
+{
+    return (target & GPUTarget::GPU_ARCH_MASK);
+}
 } // namespace arm_compute
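Note: get_target_from_device() parses the CL_DEVICE_NAME string (e.g. "Mali-T760", "Mali-G71") and falls back to MIDGARD when no Mali GPU is found, while get_arch_from_target() keeps only the architecture bits. A minimal host-side sketch of how the helpers combine, assuming the CLScheduler has already been initialised (illustrative, not part of this patch):

    #include "arm_compute/core/CL/CLHelpers.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include <iostream>

    void report_gpu_target()
    {
        using namespace arm_compute;

        // Target detected from the device the scheduler was initialised with.
        const GPUTarget target = CLScheduler::get().target();
        // Architecture only (MIDGARD or BIFROST), product bits masked off.
        const GPUTarget arch = get_arch_from_target(target);

        std::cout << "GPU: " << string_from_target(target)
                  << " (arch: " << string_from_target(arch) << ")" << std::endl;
    }
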
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index bc12aad..15a5d90 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -124,6 +124,7 @@
     { "channel_extract_YUYV422", "channel_extract.cl" },
     { "combine_gradients_L1", "canny.cl" },
     { "combine_gradients_L2", "canny.cl" },
+    { "concatenate_depth", "concatenate.cl" },
     { "convolution_rectangle", "convolution_rectangle.cl" },
     { "col2im", "convolution_layer.cl" },
     { "convolution3x3_static", "convolution3x3.cl" },
@@ -159,9 +160,11 @@
     { "gemm_ma_f32", "gemm.cl" },
     { "gemm_mm_u8", "gemm.cl" },
     { "gemm_mm_f16", "gemm.cl" },
-    { "gemm_mm_f32", "gemm.cl" },
+    { "gemm_mm_f32_midgard", "gemm.cl" },
+    { "gemm_mm_f32_bifrost", "gemm.cl" },
     { "gemm_vm_f16", "gemm.cl" },
     { "gemm_vm_f32", "gemm.cl" },
+    { "gemm_lc_vm_f32", "gemm.cl" },
     { "gemm_transpose1x16_u8", "gemm.cl" },
     { "gemm_transpose1x8_f16", "gemm.cl" },
     { "gemm_transpose1x4_f32", "gemm.cl" },
@@ -172,6 +175,9 @@
     { "hist_border_kernel_fixed", "histogram.cl" },
     { "hist_local_kernel", "histogram.cl" },
     { "hist_local_kernel_fixed", "histogram.cl" },
+    { "hog_block_normalization", "hog.cl" },
+    { "hog_detector", "hog.cl" },
+    { "hog_orientation_binning", "hog.cl" },
     { "hysteresis", "canny.cl" },
     { "im2col_generic", "convolution_layer.cl" },
     { "im2col_reduced", "convolution_layer.cl" },
@@ -199,7 +205,8 @@
     { "non_linear_filter_disk5x5", "non_linear_filter5x5.cl" },
     { "non_max_suppression", "nonmax.cl" },
     { "normalization_layer_cross_map", "normalization_layer.cl" },
-    { "normalization_layer_in_map", "normalization_layer.cl" },
+    { "normalization_layer_in_map_1D", "normalization_layer.cl" },
+    { "batchnormalization_layer", "batchnormalization_layer.cl" },
     { "NV12_to_IYUV_bt709", "color_convert.cl" },
     { "NV12_to_RGB888_bt709", "color_convert.cl" },
     { "NV12_to_RGBA8888_bt709", "color_convert.cl" },
@@ -290,6 +297,10 @@
 #include "./cl_kernels/channel_extract.clembed"
     },
     {
+        "concatenate.cl",
+#include "./cl_kernels/concatenate.clembed"
+    },
+    {
         "color_convert.cl",
 #include "./cl_kernels/color_convert.clembed"
     },
@@ -362,6 +373,10 @@
 #include "./cl_kernels/histogram.clembed"
     },
     {
+        "hog.cl",
+#include "./cl_kernels/hog.clembed"
+    },
+    {
         "integral_image.cl",
 #include "./cl_kernels/integral_image.clembed"
     },
@@ -398,6 +413,10 @@
 #include "./cl_kernels/normalization_layer.clembed"
     },
     {
+        "batchnormalization_layer.cl",
+#include "./cl_kernels/batchnormalization_layer.clembed"
+    },
+    {
         "optical_flow_pyramid_lk.cl",
 #include "./cl_kernels/optical_flow_pyramid_lk.clembed"
     },
diff --git a/src/core/CL/ICLHOG.cpp b/src/core/CL/ICLHOG.cpp
new file mode 100644
index 0000000..e182997
--- /dev/null
+++ b/src/core/CL/ICLHOG.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLHOG.h"
+
+using namespace arm_compute;
+
+ICLHOG::ICLHOG()
+    : _mapping(nullptr)
+{
+}
+
+void ICLHOG::map(cl::CommandQueue &q, bool blocking)
+{
+    _mapping = do_map(q, blocking);
+}
+
+void ICLHOG::unmap(cl::CommandQueue &q)
+{
+    do_unmap(q);
+    _mapping = nullptr;
+}
+
+float *ICLHOG::descriptor() const
+{
+    return reinterpret_cast<float *>(_mapping);
+}
\ No newline at end of file
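Note: ICLHOG follows the same map/unmap pattern as the other CL interfaces: descriptor() is only valid between map() and unmap(). A short usage sketch, assuming an initialised CLScheduler and a concrete CL-backed HOG object (both assumptions, not shown in this patch):

    #include "arm_compute/core/CL/ICLHOG.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include <cstddef>
    #include <iostream>

    void dump_descriptor(arm_compute::ICLHOG &hog, size_t num_values)
    {
        cl::CommandQueue &queue = arm_compute::CLScheduler::get().queue();

        hog.map(queue, true);                 // blocking map
        const float *desc = hog.descriptor(); // host-visible while mapped
        for(size_t i = 0; i < num_values; ++i)
        {
            std::cout << desc[i] << " ";
        }
        std::cout << std::endl;
        hog.unmap(queue);                     // pointer invalid afterwards
    }
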
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index db6212f..7ac0fe3 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/CL/ICLKernel.h"
 
+#include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -30,6 +31,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 
 #include <cstddef>
 
@@ -59,7 +61,7 @@
 }
 
 ICLKernel::ICLKernel()
-    : _kernel(nullptr), _lws_hint(cl::Range_128_1)
+    : _kernel(nullptr), _lws_hint(cl::Range_128_1), _target(CLScheduler::get().target())
 {
 }
 
@@ -78,10 +80,9 @@
 void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window)
 {
     ARM_COMPUTE_ERROR_ON(tensor == nullptr);
-    ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(window, tensor->info()->num_dimensions());
 
-    const TensorInfo *info    = tensor->info();
-    const Strides    &strides = info->strides_in_bytes();
+    const ITensorInfo *info    = tensor->info();
+    const Strides     &strides = info->strides_in_bytes();
 
     // Calculate offset to the start of the window
     unsigned int offset_first_element = info->offset_first_element_in_bytes();
@@ -136,3 +137,18 @@
 {
     return num_arguments_per_tensor<3>();
 }
+
+void ICLKernel::set_target(cl::Device &device)
+{
+    _target = get_target_from_device(device);
+}
+
+void ICLKernel::set_target(GPUTarget target)
+{
+    _target = target;
+}
+
+GPUTarget ICLKernel::get_target() const
+{
+    return _target;
+}
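Note: the new _target member lets each kernel know which GPU it will run on; set_target() is expected to be called at configure time and get_target() read when choosing a kernel variant. A hedged sketch of the kind of selection this enables (the actual logic inside the GEMM kernel is not part of this diff):

    #include "arm_compute/core/CL/CLHelpers.h"
    #include "arm_compute/core/CL/ICLKernel.h"
    #include <string>

    // Sketch only: pick the gemm_mm_f32 variant registered in CLKernelLibrary.
    std::string select_gemm_mm_f32(const arm_compute::ICLKernel &kernel)
    {
        using arm_compute::GPUTarget;
        const GPUTarget arch = arm_compute::get_arch_from_target(kernel.get_target());

        return (arch == GPUTarget::BIFROST) ? "gemm_mm_f32_bifrost"
                                            : "gemm_mm_f32_midgard";
    }
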
diff --git a/src/core/CL/ICLMultiHOG.cpp b/src/core/CL/ICLMultiHOG.cpp
new file mode 100644
index 0000000..8ece566
--- /dev/null
+++ b/src/core/CL/ICLMultiHOG.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLMultiHOG.h"
+
+#include "arm_compute/core/IHOG.h"
+
+using namespace arm_compute;
+
+IHOG *ICLMultiHOG::model(size_t index)
+{
+    return cl_model(index);
+}
+
+const IHOG *ICLMultiHOG::model(size_t index) const
+{
+    return cl_model(index);
+}
\ No newline at end of file
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 80fcfb5..3b8dfd2 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -51,6 +51,8 @@
 using clCreateBuffer_func            = cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *);
 using clCreateProgramWithSource_func = cl_program (*)(cl_context, cl_uint, const char **, const size_t *, cl_int *);
 using clReleaseKernel_func           = cl_int (*)(cl_kernel kernel);
+using clGetDeviceInfo_func           = cl_int (*)(cl_device_id, cl_device_info, size_t, void *, size_t *);
+using clGetDeviceIDs_func            = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *);
 
 class CLSymbols
 {
@@ -88,6 +90,8 @@
             clRetainCommandQueue      = reinterpret_cast<clRetainCommandQueue_func>(dlsym(handle, "clRetainCommandQueue"));
             clEnqueueUnmapMemObject   = reinterpret_cast<clEnqueueUnmapMemObject_func>(dlsym(handle, "clEnqueueUnmapMemObject"));
             clReleaseMemObject        = reinterpret_cast<clReleaseMemObject_func>(dlsym(handle, "clReleaseMemObject"));
+            clGetDeviceInfo           = reinterpret_cast<clGetDeviceInfo_func>(dlsym(handle, "clGetDeviceInfo"));
+            clGetDeviceIDs            = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs"));
             dlclose(handle);
         }
     }
@@ -123,6 +127,8 @@
     clRetainCommandQueue_func      clRetainCommandQueue      = nullptr;
     clEnqueueUnmapMemObject_func   clEnqueueUnmapMemObject   = nullptr;
     clReleaseMemObject_func        clReleaseMemObject        = nullptr;
+    clGetDeviceInfo_func           clGetDeviceInfo           = nullptr;
+    clGetDeviceIDs_func            clGetDeviceIDs            = nullptr;
 };
 
 bool arm_compute::opencl_is_available()
@@ -544,3 +550,37 @@
         return CL_OUT_OF_RESOURCES;
     }
 }
+
+cl_int clGetDeviceIDs(cl_platform_id platform,
+                      cl_device_type device_type,
+                      cl_uint        num_entries,
+                      cl_device_id *devices,
+                      cl_uint       *num_devices)
+{
+    auto func = CLSymbols::get().clGetDeviceIDs;
+    if(func != nullptr)
+    {
+        return func(platform, device_type, num_entries, devices, num_devices);
+    }
+    else
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+}
+
+cl_int clGetDeviceInfo(cl_device_id   device,
+                       cl_device_info param_name,
+                       size_t         param_value_size,
+                       void          *param_value,
+                       size_t        *param_value_size_ret)
+{
+    auto func = CLSymbols::get().clGetDeviceInfo;
+    if(func != nullptr)
+    {
+        return func(device, param_name, param_value_size, param_value, param_value_size_ret);
+    }
+    else
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+}
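Note: the new stubs follow the existing lazy-loading scheme: the real symbols are resolved with dlsym() when libOpenCL.so is found, and the wrappers return CL_OUT_OF_RESOURCES otherwise. A small defensive-usage sketch on the host side (the device handle is assumed to be valid):

    #include "arm_compute/core/CL/OpenCL.h"
    #include <string>
    #include <vector>

    std::string device_name(cl_device_id device)
    {
        if(!arm_compute::opencl_is_available())
        {
            return "(no OpenCL runtime)";
        }

        size_t size = 0;
        if(clGetDeviceInfo(device, CL_DEVICE_NAME, 0, nullptr, &size) != CL_SUCCESS || size == 0)
        {
            return "(unknown device)";
        }

        std::vector<char> name(size);
        clGetDeviceInfo(device, CL_DEVICE_NAME, size, name.data(), nullptr);
        return std::string(name.begin(), name.end());
    }
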
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
index 23142ad..e3cbb6c 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -35,21 +35,25 @@
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
  * @param[out] output_ptr                           Pointer to the destination image. Supported data types: F16, F32
  * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  */
 __kernel void activation_layer(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
 {
     // Get pixels pointer
-    Image input  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image output = CONVERT_TO_IMAGE_STRUCT(output);
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
     // Load data
     VEC_DATA_TYPE(DATA_TYPE, 16)
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
new file mode 100644
index 0000000..13e6702
--- /dev/null
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Apply batch normalization.
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                             Pointer to the mean source tensor. Supported data types: F32
+ * @param[in]  mean_stride_x                        Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                          mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes   The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                              Pointer to the var tensor. Supported data types: F32
+ * @param[in]  var_stride_x                         Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                           var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  var_offset_first_element_in_bytes    The offset of the first element in the var source tensor
+ * @param[in]  beta_ptr                             Pointer to the beta source tensor. Supported data types: F32
+ * @param[in]  beta_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                          beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes   The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                            Pointer to the gamma source tensor. Supported data types: F32
+ * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
+ * @param[in]  epsilon                              Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer(TENSOR3D_DECLARATION(input),
+                                       TENSOR3D_DECLARATION(output),
+                                       VECTOR_DECLARATION(mean),
+                                       VECTOR_DECLARATION(var),
+                                       VECTOR_DECLARATION(beta),
+                                       VECTOR_DECLARATION(gamma),
+                                       float epsilon)
+{
+    Tensor3D in    = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D out   = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Vector   mean  = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   var   = CONVERT_TO_VECTOR_STRUCT(var);
+    Vector   beta  = CONVERT_TO_VECTOR_STRUCT(beta);
+    Vector   gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+
+    float4 _in         = 0;
+    float4 denominator = 0;
+    float4 numerator   = 0;
+    float4 x_bar       = 0;
+    float4 gamma_vec   = 0;
+    float4 beta_vec    = 0;
+
+    const int current_slice = get_global_id(2);
+
+    _in         = vload4(0, (__global float *)in.ptr);
+    denominator = *((__global float *)(var.ptr + current_slice * var.stride_x));
+    denominator = rsqrt(denominator + epsilon);
+
+    // Calculate x bar and store results
+    numerator = *((__global float *)(mean.ptr + current_slice * mean.stride_x));
+    numerator = _in - numerator;
+    x_bar     = numerator * denominator;
+
+    gamma_vec = *((__global float *)(gamma.ptr + current_slice * gamma.stride_x));
+    beta_vec  = *((__global float *)(beta.ptr + current_slice * beta.stride_x));
+
+    vstore4(gamma_vec * x_bar + beta_vec, 0, (__global float *)out.ptr);
+}
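Note: each work-item above normalises four values of one Z-slice using the per-channel mean, variance, beta and gamma. For reference, the scalar equivalent of what the kernel computes per element (names illustrative, not library API):

    #include <cmath>

    // Sketch only: inference-time batch normalisation for a single value.
    float batch_norm(float x, float mean, float var, float beta, float gamma, float epsilon)
    {
        const float x_bar = (x - mean) / std::sqrt(var + epsilon); // normalise
        return gamma * x_bar + beta;                               // scale and shift
    }
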
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
new file mode 100644
index 0000000..00f5189
--- /dev/null
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: F32
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  offset                            The offset to the first valid element of the output tensor in bytes
+ */
+__kernel void concatenate_depth(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    unsigned int offset)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    float4 source_values = vload4(0, (__global float *)src.ptr);
+
+    vstore4(source_values, 0, (__global float *)(dst.ptr + offset));
+}
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
index 4554dd0..bd5dfaf 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/convolution_layer.cl
@@ -44,36 +44,53 @@
  * @param[in]  bias_ptr                           Pointer to the bias tensor. Same as input
  * @param[in]  bias_stride_x                      Stride of the bias tensor in X dimension (in bytes)
  * @param[in]  bias_step_x                        bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  bias_stride_y                      Stride of the bias tensor in Y dimension (in bytes)
- * @param[in]  bias_step_y                        bias_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  bias_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[in]  width                              The width of the input tensor
  * @param[in]  height                             The height of the input tensor
+ * @param[in]  depth                              The depth of the input tensor
+ * @param[in]  total_filters                      Total number of filters. 4th dimension of the weights matrix
  */
 __kernel void reshape_to_columns(
     TENSOR3D_DECLARATION(src),
     IMAGE_DECLARATION(dst),
 #if defined HAS_BIAS
-    IMAGE_DECLARATION(bias),
+    VECTOR_DECLARATION(bias),
 #endif
-    uint width, uint height)
+    uint width, uint height, uint depth, uint total_filters)
 {
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D src            = CONVERT_TO_TENSOR3D_STRUCT(src);
+    bool     is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1));
 
-    __global uchar *tmp_out_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y
-                                  + get_global_id(1) * width * dst_stride_y + get_global_id(2) * width * height * dst_stride_y;
+    __global uchar *tmp_src_ptr = src.ptr;
+    __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y + get_global_id(1) * width * dst_stride_y + get_global_id(
+                                      2) * width * height * dst_stride_y;
+#if defined         HAS_BIAS
+    __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes;
+#endif
 
-    *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)src.ptr);
+    if(is_last_thread)
+    {
+        for(uint i = 0; i < total_filters; ++i)
+        {
+            *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr);
 
 #if defined HAS_BIAS
-    // If it is the last thread in the 3 dimensional workgroup
-    if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1))
-    {
-        tmp_out_ptr += dst_stride_y;
-        *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)(bias_ptr + bias_offset_first_element_in_bytes));
-    }
-
+            *((__global DATA_TYPE *)(tmp_dst_ptr + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr));
+            tmp_bias_ptr += bias_stride_x;
 #endif
+            tmp_src_ptr += depth * src_stride_z;
+            tmp_dst_ptr += dst_stride_x;
+        }
+    }
+    else
+    {
+        for(uint i = 0; i < total_filters; ++i)
+        {
+            *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr);
+            tmp_src_ptr += depth * src_stride_z;
+            tmp_dst_ptr += dst_stride_x;
+        }
+    }
 }
 
 /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
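Note: reshape_to_columns now copies one weight per filter from each work-item, stepping through the 4th (filter) dimension with depth * src_stride_z, so filter n ends up as column n of the destination; when HAS_BIAS is defined the last work-item also appends the bias value as the final row. A small index sketch of the resulting layout (illustrative only):

    // Sketch only: destination layout produced by reshape_to_columns for
    // weights of shape [W, H, D, N] (one column per filter):
    //   dst(row, n) = weights(x, y, z, n)  with  row = x + y * W + z * W * H
    //   dst(W * H * D, n) = bias(n)        // only when HAS_BIAS is defined
    int reshape_row(int x, int y, int z, int W, int H)
    {
        return x + y * W + z * W * H; // linearised position inside one filter
    }
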
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index db849f5..caf6e3f 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -408,7 +408,8 @@
 }
 #endif
 
-/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+#if(defined WIDTH_MATRIX_B && defined ALPHA)
+/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
  *
  * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
@@ -432,10 +433,9 @@
  * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
  */
-#if(defined WIDTH_MATRIX_B && defined ALPHA)
-__kernel void gemm_mm_f32(IMAGE_DECLARATION(src0),
-                          IMAGE_DECLARATION(src1),
-                          IMAGE_DECLARATION(dst))
+__kernel void gemm_mm_f32_midgard(IMAGE_DECLARATION(src0),
+                                  IMAGE_DECLARATION(src1),
+                                  IMAGE_DECLARATION(dst))
 {
     /* src_addr.s0 = address of matrix A */
     /* src_addr.s1 = address of matrix B */
@@ -508,6 +508,216 @@
     vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
 }
 
+/** This OpenCL kernel is optimised for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
+ *
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: F32
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_f32_bifrost(IMAGE_DECLARATION(src0),
+                                  IMAGE_DECLARATION(src1),
+                                  IMAGE_DECLARATION(dst))
+{
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);
+    __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
+
+    // Compute end row address for matrix B
+    __global float *src_end_addr_b = src_addr_b + WIDTH_MATRIX_B;
+
+    // Reset accumulators
+    float c00 = 0.0f;
+    float c01 = 0.0f;
+    float c02 = 0.0f;
+    float c03 = 0.0f;
+    float c10 = 0.0f;
+    float c11 = 0.0f;
+    float c12 = 0.0f;
+    float c13 = 0.0f;
+    float c20 = 0.0f;
+    float c21 = 0.0f;
+    float c22 = 0.0f;
+    float c23 = 0.0f;
+    float c30 = 0.0f;
+    float c31 = 0.0f;
+    float c32 = 0.0f;
+    float c33 = 0.0f;
+
+    for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = vload4(0, src_addr_a);
+        float4 b0 = vload4(0, src_addr_b);
+
+        c00 = fma(a0.s0, b0.s0, c00);
+        c01 = fma(a0.s0, b0.s1, c01);
+        c02 = fma(a0.s0, b0.s2, c02);
+        c03 = fma(a0.s0, b0.s3, c03);
+
+        c10 = fma(a0.s1, b0.s0, c10);
+        c11 = fma(a0.s1, b0.s1, c11);
+        c12 = fma(a0.s1, b0.s2, c12);
+        c13 = fma(a0.s1, b0.s3, c13);
+
+        c20 = fma(a0.s2, b0.s0, c20);
+        c21 = fma(a0.s2, b0.s1, c21);
+        c22 = fma(a0.s2, b0.s2, c22);
+        c23 = fma(a0.s2, b0.s3, c23);
+
+        c30 = fma(a0.s3, b0.s0, c30);
+        c31 = fma(a0.s3, b0.s1, c31);
+        c32 = fma(a0.s3, b0.s2, c32);
+        c33 = fma(a0.s3, b0.s3, c33);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a + 4);
+        b0 = vload4(0, src_addr_b + 4);
+
+        c00 = fma(a0.s0, b0.s0, c00);
+        c01 = fma(a0.s0, b0.s1, c01);
+        c02 = fma(a0.s0, b0.s2, c02);
+        c03 = fma(a0.s0, b0.s3, c03);
+
+        c10 = fma(a0.s1, b0.s0, c10);
+        c11 = fma(a0.s1, b0.s1, c11);
+        c12 = fma(a0.s1, b0.s2, c12);
+        c13 = fma(a0.s1, b0.s3, c13);
+
+        c20 = fma(a0.s2, b0.s0, c20);
+        c21 = fma(a0.s2, b0.s1, c21);
+        c22 = fma(a0.s2, b0.s2, c22);
+        c23 = fma(a0.s2, b0.s3, c23);
+
+        c30 = fma(a0.s3, b0.s0, c30);
+        c31 = fma(a0.s3, b0.s1, c31);
+        c32 = fma(a0.s3, b0.s2, c32);
+        c33 = fma(a0.s3, b0.s3, c33);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a + 8);
+        b0 = vload4(0, src_addr_b + 8);
+
+        c00 = fma(a0.s0, b0.s0, c00);
+        c01 = fma(a0.s0, b0.s1, c01);
+        c02 = fma(a0.s0, b0.s2, c02);
+        c03 = fma(a0.s0, b0.s3, c03);
+
+        c10 = fma(a0.s1, b0.s0, c10);
+        c11 = fma(a0.s1, b0.s1, c11);
+        c12 = fma(a0.s1, b0.s2, c12);
+        c13 = fma(a0.s1, b0.s3, c13);
+
+        c20 = fma(a0.s2, b0.s0, c20);
+        c21 = fma(a0.s2, b0.s1, c21);
+        c22 = fma(a0.s2, b0.s2, c22);
+        c23 = fma(a0.s2, b0.s3, c23);
+
+        c30 = fma(a0.s3, b0.s0, c30);
+        c31 = fma(a0.s3, b0.s1, c31);
+        c32 = fma(a0.s3, b0.s2, c32);
+        c33 = fma(a0.s3, b0.s3, c33);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a + 12);
+        b0 = vload4(0, src_addr_b + 12);
+
+        c00 = fma(a0.s0, b0.s0, c00);
+        c01 = fma(a0.s0, b0.s1, c01);
+        c02 = fma(a0.s0, b0.s2, c02);
+        c03 = fma(a0.s0, b0.s3, c03);
+
+        c10 = fma(a0.s1, b0.s0, c10);
+        c11 = fma(a0.s1, b0.s1, c11);
+        c12 = fma(a0.s1, b0.s2, c12);
+        c13 = fma(a0.s1, b0.s3, c13);
+
+        c20 = fma(a0.s2, b0.s0, c20);
+        c21 = fma(a0.s2, b0.s1, c21);
+        c22 = fma(a0.s2, b0.s2, c22);
+        c23 = fma(a0.s2, b0.s3, c23);
+
+        c30 = fma(a0.s3, b0.s0, c30);
+        c31 = fma(a0.s3, b0.s1, c31);
+        c32 = fma(a0.s3, b0.s2, c32);
+        c33 = fma(a0.s3, b0.s3, c33);
+    }
+
+    for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = vload4(0, src_addr_a);
+        float4 b0 = vload4(0, src_addr_b);
+
+        c00 = fma(a0.s0, b0.s0, c00);
+        c01 = fma(a0.s0, b0.s1, c01);
+        c02 = fma(a0.s0, b0.s2, c02);
+        c03 = fma(a0.s0, b0.s3, c03);
+
+        c10 = fma(a0.s1, b0.s0, c10);
+        c11 = fma(a0.s1, b0.s1, c11);
+        c12 = fma(a0.s1, b0.s2, c12);
+        c13 = fma(a0.s1, b0.s3, c13);
+
+        c20 = fma(a0.s2, b0.s0, c20);
+        c21 = fma(a0.s2, b0.s1, c21);
+        c22 = fma(a0.s2, b0.s2, c22);
+        c23 = fma(a0.s2, b0.s3, c23);
+
+        c30 = fma(a0.s3, b0.s0, c30);
+        c31 = fma(a0.s3, b0.s1, c31);
+        c32 = fma(a0.s3, b0.s2, c32);
+        c33 = fma(a0.s3, b0.s3, c33);
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Multiply by the weight of matrix product
+    c00 = c00 * ALPHA;
+    c01 = c01 * ALPHA;
+    c02 = c02 * ALPHA;
+    c03 = c03 * ALPHA;
+    c10 = c10 * ALPHA;
+    c11 = c11 * ALPHA;
+    c12 = c12 * ALPHA;
+    c13 = c13 * ALPHA;
+    c20 = c20 * ALPHA;
+    c21 = c21 * ALPHA;
+    c22 = c22 * ALPHA;
+    c23 = c23 * ALPHA;
+    c30 = c30 * ALPHA;
+    c31 = c31 * ALPHA;
+    c32 = c32 * ALPHA;
+    c33 = c33 * ALPHA;
+
+    barrier(CLK_GLOBAL_MEM_FENCE);
+
+    // Store 4x4 block
+    vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
+    vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
+    vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));
+    vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));
+}
+
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f16 and @ref gemm_transpose1x8_f16 before running the matrix multiplication
  *
@@ -607,6 +817,7 @@
     vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
 }
 
+#if(defined WIDTH_VECTOR_A)
 /** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
  *
  * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
@@ -632,7 +843,6 @@
  * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
  */
-#if(defined WIDTH_VECTOR_A)
 __kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),
                           IMAGE_DECLARATION(src1),
                           IMAGE_DECLARATION(dst))
@@ -746,6 +956,7 @@
 #endif /* (defined WIDTH_VECTOR_A) */
 #endif /* (defined WIDTH_MATRIX_B && defined ALPHA) */
 
+#if(defined BETA)
 /** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
  *
  * @attention The beta's value need to be passed at compile time using -DBETA
@@ -763,7 +974,6 @@
  * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
  */
-#if(defined BETA)
 __kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
                           IMAGE_DECLARATION(dst))
 {
@@ -819,3 +1029,71 @@
     vstore8(out, 0, (__global half *)dst.ptr);
 }
 #endif /* (defined BETA) */
+
+#if(defined WIDTH_VECTOR_A)
+/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
+ *
+ * @attention The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A
+ *
+ * @attention The input A and matrix B must not be reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: F32
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
+                             TENSOR3D_DECLARATION(src1),
+                             IMAGE_DECLARATION(dst))
+{
+    int idx = get_global_id(0) * 4;
+    int idy = get_global_id(1);
+
+    /* Compute the address for the vector A and matrix B */
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
+    src_addr.s1 += idx * sizeof(float);
+
+    int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
+
+    float4 acc = 0.0f;
+
+    for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+    {
+        float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
+        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+        float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+        acc += b0 * (float4)a0.s0;
+        acc += b1 * (float4)a0.s1;
+    }
+
+    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+    {
+        float  a0 = *((__global float *)(src0_ptr + src_addr.s0));
+        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+
+        acc += b0 * (float4)a0;
+    }
+
+    /* Compute destination address */
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
+}
+#endif /* (defined WIDTH_VECTOR_A) */
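Note: the Bifrost variant unrolls the accumulation loop by four and keeps sixteen scalar accumulators updated with fma(), while gemm_lc_vm_f32 performs a per-row vector-matrix product for locally connected layers. A scalar C++ sketch of the 4x4 block both matrix-multiply kernels compute (interleave/transpose layout handling omitted, names illustrative):

    // Sketch only: C = alpha * A * B for one 4x4 output block.
    // a[k][i] holds the interleaved 4-wide slice of A at step k,
    // b[k][j] the transposed 4-wide slice of B at step k.
    void gemm_block_4x4(const float a[][4], const float b[][4], float c[4][4], int k_len, float alpha)
    {
        for(int i = 0; i < 4; ++i)
            for(int j = 0; j < 4; ++j)
                c[i][j] = 0.0f;

        for(int k = 0; k < k_len; ++k)
            for(int i = 0; i < 4; ++i)
                for(int j = 0; j < 4; ++j)
                    c[i][j] += a[k][i] * b[k][j]; // fma(a0.si, b0.sj, cij) in the kernel

        for(int i = 0; i < 4; ++i)
            for(int j = 0; j < 4; ++j)
                c[i][j] *= alpha; // weight of the matrix product (-DALPHA)
    }
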
diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
new file mode 100644
index 0000000..31dd57b
--- /dev/null
+++ b/src/core/CL/cl_kernels/hog.cl
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "types.h"
+
+#if(defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE)
+
+/** This OpenCL kernel computes the HOG orientation binning
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DCELL_WIDTH = Width of the cell
+ * -# -DCELL_HEIGHT = height of the cell
+ * -# -DNUM_BINS = Number of bins for each cell
+ * -# -DPHASE_SCALE = Scale factor used to evaluate the index of the local HOG
+ *
+ * @note Each work-item computes a single cell
+ *
+ * @param[in]  mag_ptr                             Pointer to the source image which stores the magnitude of the gradient for each pixel. Supported data types: S16
+ * @param[in]  mag_stride_x                        Stride of the magnitude image in X dimension (in bytes)
+ * @param[in]  mag_step_x                          mag_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mag_stride_y                        Stride of the magnitude image in Y dimension (in bytes)
+ * @param[in]  mag_step_y                          mag_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  mag_offset_first_element_in_bytes   The offset of the first element in the magnitude image
+ * @param[in]  phase_ptr                           Pointer to the source image which stores the phase of the gradient for each pixel. Supported data types: U8
+ * @param[in]  phase_stride_x                      Stride of the phase image in X dimension (in bytes)
+ * @param[in]  phase_step_x                        phase_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  phase_stride_y                      Stride of the phase image in Y dimension (in bytes)
+ * @param[in]  phase_step_y                        phase_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  phase_offset_first_element_in_bytes The offset of the first element in the phase image
+ * @param[out] dst_ptr                             Pointer to the destination image which stores the local HOG for each cell. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in]  dst_stride_x                        Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                        Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination image
+ */
+__kernel void hog_orientation_binning(IMAGE_DECLARATION(mag),
+                                      IMAGE_DECLARATION(phase),
+                                      IMAGE_DECLARATION(dst))
+{
+    float bins[NUM_BINS] = { 0 };
+
+    // Compute address for the magnitude and phase images
+    Image mag   = CONVERT_TO_IMAGE_STRUCT(mag);
+    Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
+
+    __global uchar *mag_row_ptr   = mag.ptr;
+    __global uchar *phase_row_ptr = phase.ptr;
+
+    for(int yc = 0; yc < CELL_HEIGHT; ++yc)
+    {
+        int xc = 0;
+        for(; xc <= (CELL_WIDTH - 4); xc += 4)
+        {
+            // Load magnitude and phase values
+            const float4 mag_f32   = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc));
+            float4       phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc));
+
+            // Scale phase: phase * scale + 0.5f
+            phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE;
+
+            // Compute histogram index.
+            int4 hidx_s32 = convert_int4(phase_f32);
+
+            // Compute magnitude weights (w0 and w1)
+            const float4 hidx_f32 = convert_float4(hidx_s32);
+
+            // w1 = phase_f32 - hidx_s32
+            const float4 w1_f32 = phase_f32 - hidx_f32;
+
+            // w0 = 1.0 - w1
+            const float4 w0_f32 = (float4)1.0f - w1_f32;
+
+            // Calculate the weights for splitting vote
+            const float4 mag_w0_f32 = mag_f32 * w0_f32;
+            const float4 mag_w1_f32 = mag_f32 * w1_f32;
+
+            // Weighted vote between 2 bins
+
+            // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
+            hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
+
+            // Bin 0
+            bins[hidx_s32.s0] += mag_w0_f32.s0;
+            bins[hidx_s32.s1] += mag_w0_f32.s1;
+            bins[hidx_s32.s2] += mag_w0_f32.s2;
+            bins[hidx_s32.s3] += mag_w0_f32.s3;
+
+            hidx_s32 += (int4)1;
+
+            // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
+            hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
+
+            // Bin1
+            bins[hidx_s32.s0] += mag_w1_f32.s0;
+            bins[hidx_s32.s1] += mag_w1_f32.s1;
+            bins[hidx_s32.s2] += mag_w1_f32.s2;
+            bins[hidx_s32.s3] += mag_w1_f32.s3;
+        }
+
+        // Left over computation
+        for(; xc < CELL_WIDTH; xc++)
+        {
+            const float mag_value   = *((__global short *)mag_row_ptr + xc);
+            const float phase_value = *(phase_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
+            const float w1          = phase_value - floor(phase_value);
+
+            // The quantised phase is the histogram index [0, NUM_BINS - 1]
+            // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0
+            const uint hidx = (uint)(phase_value) % NUM_BINS;
+
+            // Weighted vote between 2 bins
+            bins[hidx] += mag_value * (1.0f - w1);
+            bins[(hidx + 1) % NUM_BINS] += mag_value * w1;
+        }
+
+        // Point to the next row of magnitude and phase images
+        mag_row_ptr += mag_stride_y;
+        phase_row_ptr += phase_stride_y;
+    }
+
+    // Compute address for the destination image
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Store the local HOG in the global memory
+    int xc = 0;
+    for(; xc <= (NUM_BINS - 4); xc += 4)
+    {
+        float4 values = vload4(0, bins + xc);
+
+        vstore4(values, 0, ((__global float *)dst.ptr) + xc);
+    }
+
+    // Left over stores
+    for(; xc < NUM_BINS; ++xc)
+    {
+        ((__global float *)dst.ptr)[xc] = bins[xc];
+    }
+}
+#endif // (defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE)
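For reference, the per-cell binning done by hog_orientation_binning amounts to the host-side sketch below. It is illustrative only and not part of this patch: the function name, the element-based strides and the purely scalar loop are assumptions, but the weighted vote between the two neighbouring bins mirrors the kernel above.

    #include <cmath>

    // Host-side reference of HOG orientation binning for one cell (illustrative only).
    // Each pixel's magnitude is split between the two closest orientation bins.
    void cell_histogram_reference(const short *mag, const unsigned char *phase,
                                  int cell_width, int cell_height,
                                  int mag_stride, int phase_stride, // strides in elements
                                  int num_bins, float phase_scale, float *bins)
    {
        for(int b = 0; b < num_bins; ++b)
        {
            bins[b] = 0.0f;
        }

        for(int y = 0; y < cell_height; ++y)
        {
            for(int x = 0; x < cell_width; ++x)
            {
                const float m  = mag[y * mag_stride + x];
                const float p  = phase[y * phase_stride + x] * phase_scale + 0.5f;
                const int   h0 = static_cast<int>(p) % num_bins; // lower of the two neighbouring bins
                const int   h1 = (h0 + 1) % num_bins;            // upper bin (wraps around)
                const float w1 = p - std::floor(p);              // fraction of the vote going to h1

                bins[h0] += m * (1.0f - w1);
                bins[h1] += m * w1;
            }
        }
    }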
+
+#if(defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && defined HOG_NORM_TYPE && defined L2_HYST_THRESHOLD)
+
+#ifndef L2_NORM
+#error The value of enum class HOGNormType::L2_NORM has not been passed to the OpenCL kernel
+#endif
+
+#ifndef L2HYS_NORM
+#error The value of enum class HOGNormType::L2HYS_NORM has not been passed to the OpenCL kernel
+#endif
+
+#ifndef L1_NORM
+#error The value of enum class HOGNormType::L1_NORM has not been passed to the OpenCL kernel
+#endif
+
+/** This OpenCL kernel computes the HOG block normalization
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DNUM_CELLS_PER_BLOCK_HEIGHT = Number of cells per block along the Y direction
+ * -# -DNUM_BINS_PER_BLOCK_X = Number of bins for each block along the X direction
+ * -# -DNUM_BINS_PER_BLOCK = Number of bins for each block
+ * -# -DHOG_NORM_TYPE = Normalization type
+ * -# -DL2_HYST_THRESHOLD = Threshold used for L2HYS_NORM normalization method
+ * -# -DL2_NORM = Value of the enum class HOGNormType::L2_NORM
+ * -# -DL2HYS_NORM = Value of the enum class HOGNormType::L2HYS_NORM
+ * -# -DL1_NORM = Value of the enum class HOGNormType::L1_NORM
+ *
+ * @note Each work-item computes a single block
+ *
+ * @param[in]  src_ptr                           Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image which stores the normalized HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per block
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void hog_block_normalization(IMAGE_DECLARATION(src),
+                                      IMAGE_DECLARATION(dst))
+{
+    float  sum     = 0.0f;
+    float4 sum_f32 = (float4)(0.0f);
+
+    // Compute address for the source and destination tensor
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc)
+    {
+        const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y);
+
+        int xc = 0;
+        for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16)
+        {
+            const float4 val0 = vload4(0, hist_ptr + xc + 0);
+            const float4 val1 = vload4(0, hist_ptr + xc + 4);
+            const float4 val2 = vload4(0, hist_ptr + xc + 8);
+            const float4 val3 = vload4(0, hist_ptr + xc + 12);
+
+#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+            // Compute val^2 for L2_NORM or L2HYS_NORM
+            sum_f32 += val0 * val0;
+            sum_f32 += val1 * val1;
+            sum_f32 += val2 * val2;
+            sum_f32 += val3 * val3;
+#else
+            // Compute |val| for L1_NORM
+            sum_f32 += fabs(val0);
+            sum_f32 += fabs(val1);
+            sum_f32 += fabs(val2);
+            sum_f32 += fabs(val3);
+#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+
+            // Store the un-normalized input values linearly in the output image; they are reused during the normalization.
+            // Laying them out consecutively keeps the next loop, where the normalization is applied, cache friendly because
+            // all the values are accessed in order
+            vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X);
+            vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X);
+            vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X);
+            vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X);
+        }
+
+        // Compute left over
+        for(; xc < NUM_BINS_PER_BLOCK_X; ++xc)
+        {
+            const float val = hist_ptr[xc];
+
+#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+            sum += val * val;
+#else
+            sum += fabs(val);
+#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+
+            ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
+        }
+    }
+
+    sum += dot(sum_f32, (float4)1.0f);
+
+    float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f);
+
+#if(HOG_NORM_TYPE == L2HYS_NORM)
+    // Reset sum
+    sum_f32 = (float4)0.0f;
+    sum     = 0.0f;
+
+    int k = 0;
+    for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
+    {
+        float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0);
+        float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4);
+        float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8);
+        float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12);
+
+        // Scale val
+        val0 = val0 * (float4)scale;
+        val1 = val1 * (float4)scale;
+        val2 = val2 * (float4)scale;
+        val3 = val3 * (float4)scale;
+
+        // Clip val if over _threshold_l2hys
+        val0 = fmin(val0, (float4)L2_HYST_THRESHOLD);
+        val1 = fmin(val1, (float4)L2_HYST_THRESHOLD);
+        val2 = fmin(val2, (float4)L2_HYST_THRESHOLD);
+        val3 = fmin(val3, (float4)L2_HYST_THRESHOLD);
+
+        // Compute val^2
+        sum_f32 += val0 * val0;
+        sum_f32 += val1 * val1;
+        sum_f32 += val2 * val2;
+        sum_f32 += val3 * val3;
+
+        vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0);
+        vstore4(val1, 0, ((__global float *)dst.ptr) + k + 4);
+        vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8);
+        vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12);
+    }
+
+    // Compute left over
+    for(; k < NUM_BINS_PER_BLOCK; ++k)
+    {
+        float val = ((__global float *)dst.ptr)[k] * scale;
+
+        // Clip scaled input_value if over L2_HYST_THRESHOLD
+        val = fmin(val, (float)L2_HYST_THRESHOLD);
+
+        sum += val * val;
+
+        ((__global float *)dst.ptr)[k] = val;
+    }
+
+    sum += dot(sum_f32, (float4)1.0f);
+
+    // We use the same constant as OpenCV
+    scale = 1.0f / (sqrt(sum) + 1e-3f);
+
+#endif // (HOG_NORM_TYPE == L2HYS_NORM)
+
+    int i = 0;
+    for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
+    {
+        float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0);
+        float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4);
+        float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8);
+        float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12);
+
+        // Multiply val by the normalization scale factor
+        val0 = val0 * (float4)scale;
+        val1 = val1 * (float4)scale;
+        val2 = val2 * (float4)scale;
+        val3 = val3 * (float4)scale;
+
+        vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0);
+        vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4);
+        vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8);
+        vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12);
+    }
+
+    for(; i < NUM_BINS_PER_BLOCK; ++i)
+    {
+        ((__global float *)dst.ptr)[i] *= scale;
+    }
+}
+#endif // (defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && defined HOG_NORM_TYPE && defined L2_HYST_THRESHOLD)
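The L2Hys branch above can be summarised by the following host-side sketch (illustrative only, not part of this patch; the function name is hypothetical): the block descriptor is L2-normalized, clipped to L2_HYST_THRESHOLD, and then re-normalized with the same 1e-3f constant used by the kernel.

    #include <algorithm>
    #include <cmath>

    // Host-side reference of L2Hys block normalization (illustrative only).
    void l2hys_normalize_reference(float *block, int num_bins_per_block, float l2_hyst_threshold)
    {
        float sum = 0.0f;
        for(int i = 0; i < num_bins_per_block; ++i)
        {
            sum += block[i] * block[i];
        }

        // First pass: L2 normalization (same scale expression as the kernel above)
        float scale = 1.0f / (std::sqrt(sum) + num_bins_per_block * 0.1f);

        sum = 0.0f;
        for(int i = 0; i < num_bins_per_block; ++i)
        {
            block[i] = std::min(block[i] * scale, l2_hyst_threshold); // clip after scaling
            sum += block[i] * block[i];
        }

        // Second pass: re-normalize the clipped descriptor (1e-3f as in OpenCV)
        scale = 1.0f / (std::sqrt(sum) + 1e-3f);
        for(int i = 0; i < num_bins_per_block; ++i)
        {
            block[i] *= scale;
        }
    }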
+
+#if(defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && defined THRESHOLD && defined MAX_NUM_DETECTION_WINDOWS && defined IDX_CLASS && defined BLOCK_STRIDE_WIDTH && defined BLOCK_STRIDE_HEIGHT && defined DETECTION_WINDOW_WIDTH && defined DETECTION_WINDOW_HEIGHT)
+
+/** This OpenCL kernel computes the HOG detector using linear SVM
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DNUM_BLOCKS_PER_DESCRIPTOR_Y = Number of blocks per descriptor along the Y direction
+ * -# -DNUM_BINS_PER_DESCRIPTOR_X = Number of bins per descriptor along the X direction
+ * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane
+ * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectionWindow array
+ * -# -DIDX_CLASS = Index of the class to detect
+ * -# -DBLOCK_STRIDE_WIDTH = Block stride for the X direction
+ * -# -DBLOCK_STRIDE_HEIGHT = Block stride for the Y direction
+ * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window
+ * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window
+ *
+ * @note Each work-item computes a single detection window
+ *
+ * @param[in]  src_ptr                           Pointer to the source image which stores the normalized HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per block
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  hog_descriptor                    Pointer to HOG descriptor. Supported data types: F32
+ * @param[out] dst                               Pointer to DetectionWindow array
+ * @param[out] num_detection_windows             Number of objects detected
+ */
+__kernel void hog_detector(IMAGE_DECLARATION(src),
+                           __global float *hog_descriptor,
+                           __global DetectionWindow *dst,
+                           __global uint *num_detection_windows)
+{
+    // Check if the DetectionWindow array is full
+    if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
+    {
+        return;
+    }
+
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+    const int src_step_y_f32 = src_stride_y / sizeof(float);
+
+    // Init score_f32 with 0
+    float4 score_f32 = (float4)0.0f;
+
+    // Init score with 0
+    float score = 0.0f;
+
+    __global float *src_row_ptr = (__global float *)src.ptr;
+
+    // Compute Linear SVM
+    for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32)
+    {
+        int xb = 0;
+
+        const int offset_y = yb * NUM_BINS_PER_DESCRIPTOR_X;
+
+        for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8)
+        {
+            // Load descriptor values
+            float4 a0_f32 = vload4(0, src_row_ptr + xb + 0);
+            float4 a1_f32 = vload4(0, src_row_ptr + xb + 4);
+
+            float4 b0_f32 = vload4(0, hog_descriptor + xb + 0 + offset_y);
+            float4 b1_f32 = vload4(0, hog_descriptor + xb + 4 + offset_y);
+
+            // Multiply accumulate
+            score_f32 += a0_f32 * b0_f32;
+            score_f32 += a1_f32 * b1_f32;
+        }
+
+        for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb)
+        {
+            const float a = src_row_ptr[xb];
+            const float b = hog_descriptor[xb + offset_y];
+
+            score += a * b;
+        }
+    }
+
+    score += dot(score_f32, (float4)1.0f);
+
+    // Add the bias. The bias is located at the position (descriptor_size() - 1)
+    // (descriptor_size - 1) = NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y
+    score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];
+
+    if(score > (float)THRESHOLD)
+    {
+        int id = atomic_inc(num_detection_windows);
+        if(id < MAX_NUM_DETECTION_WINDOWS)
+        {
+            dst[id].x         = get_global_id(0) * BLOCK_STRIDE_WIDTH;
+            dst[id].y         = get_global_id(1) * BLOCK_STRIDE_HEIGHT;
+            dst[id].width     = DETECTION_WINDOW_WIDTH;
+            dst[id].height    = DETECTION_WINDOW_HEIGHT;
+            dst[id].idx_class = IDX_CLASS;
+            dst[id].score     = score;
+        }
+    }
+}
+#endif // (defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && defined THRESHOLD && defined MAX_NUM_DETECTION_WINDOWS && defined IDX_CLASS && defined BLOCK_STRIDE_WIDTH && defined BLOCK_STRIDE_HEIGHT && defined DETECTION_WINDOW_WIDTH && defined DETECTION_WINDOW_HEIGHT)
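The scoring performed by hog_detector reduces to a dot product plus a bias term, as in the host-side sketch below. It is illustrative only and not part of this patch; it assumes the window descriptor has already been flattened row by row into a contiguous array (in the kernel the rows are read through src_stride_y instead).

    // Host-side reference of the linear SVM evaluation (illustrative only).
    // The bias is stored right after the weights, at index descriptor_size.
    float svm_score_reference(const float *window_descriptor, const float *hog_descriptor,
                              int num_blocks_per_descriptor_y, int num_bins_per_descriptor_x)
    {
        const int descriptor_size = num_blocks_per_descriptor_y * num_bins_per_descriptor_x;

        float score = 0.0f;
        for(int i = 0; i < descriptor_size; ++i)
        {
            score += window_descriptor[i] * hog_descriptor[i];
        }

        return score + hog_descriptor[descriptor_size]; // add the bias term
    }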
diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl
index 015eeae..c4b0df8 100644
--- a/src/core/CL/cl_kernels/magnitude_phase.cl
+++ b/src/core/CL/cl_kernels/magnitude_phase.cl
@@ -57,10 +57,9 @@
  */
 inline uchar16 phase_unsigned(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
 {
-    float16 arct = atan2pi(convert_float16(b), convert_float16(a));
-    arct         = select(arct, arct + 2, arct < 0.0f);
-
-    return convert_uchar16(convert_int16(mad(arct, 90, 0.5f)) & 0xFFu);
+    float16 angle_deg_f32 = atan2pi(convert_float16(b), convert_float16(a)) * (float16)180.0f;
+    angle_deg_f32         = select(angle_deg_f32, (float16)180.0f + angle_deg_f32, angle_deg_f32 < (float16)0.0f);
+    return convert_uchar16(angle_deg_f32);
 }
 
 /** Calculates signed phase between two inputs.
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
index a913023..076b0d8 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -121,13 +121,13 @@
  * @param[in]  kappa                                       Kappa parameter in the normalization equation
  * @param[in]  radius                                      Number of elements on the right or left side to normalize across
  */
-__kernel void normalization_layer_in_map(TENSOR3D_DECLARATION(input),
-                                         TENSOR3D_DECLARATION(squared_input),
-                                         TENSOR3D_DECLARATION(output),
-                                         float coeff,
-                                         float beta,
-                                         float kappa,
-                                         uint  radius)
+__kernel void normalization_layer_in_map_1D(TENSOR3D_DECLARATION(input),
+                                            TENSOR3D_DECLARATION(squared_input),
+                                            TENSOR3D_DECLARATION(output),
+                                            float coeff,
+                                            float beta,
+                                            float kappa,
+                                            uint  radius)
 {
     Tensor3D in         = CONVERT_TO_TENSOR3D_STRUCT(input);
     Tensor3D squared_in = CONVERT_TO_TENSOR3D_STRUCT(squared_input);
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 89117cf..1902df9 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -50,20 +50,24 @@
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
  * @param[out] output_ptr                           Pointer to the destination image. Supported data types: F16, F32
  * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  * @param[in]  max_dims                             The maximum index that can be accessed in x and y dimension (width + pad)
  * @param[in]  strides                              The pooling operation strides in each dimension
  * @param[in]  paddings                             The pooling operation paddings in each dimension
  */
 __kernel void pooling_layer_2(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output)
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output)
 #ifdef POOL_AVG
     ,
     int2 max_dims, int2 strides, int2 paddings
@@ -71,14 +75,14 @@
 )
 {
     // Get pixels pointer
-    Image input  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image output = CONVERT_TO_IMAGE_STRUCT(output);
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
     // Load data
     VEC_DATA_TYPE(DATA_TYPE, 2)
-    data0 = vload2(0, (__global DATA_TYPE *)offset(&input, 0, 0));
+    data0 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
     VEC_DATA_TYPE(DATA_TYPE, 2)
-    data1 = vload2(0, (__global DATA_TYPE *)offset(&input, 0, 1));
+    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
 
     // Perform calculations
     data0         = POOL_OP(data0, data1);
@@ -104,20 +108,24 @@
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
  * @param[out] output_ptr                           Pointer to the destination image. Supported data types: F16, F32
  * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  * @param[in]  max_dims                             The maximum index that can be accessed in x and y dimension (width + pad)
  * @param[in]  strides                              The pooling operation strides in each dimension
  * @param[in]  paddings                             The pooling operation paddings in each dimension
  */
 __kernel void pooling_layer_3(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output)
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output)
 #ifdef POOL_AVG
     ,
     int2 max_dims, int2 strides, int2 paddings
@@ -125,16 +133,16 @@
 )
 {
     // Get pixels pointer
-    Image input  = CONVERT_TO_IMAGE_STRUCT(input);
-    Image output = CONVERT_TO_IMAGE_STRUCT(output);
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 
     // Load data
     VEC_DATA_TYPE(DATA_TYPE, 3)
-    data0 = vload3(0, (__global DATA_TYPE *)offset(&input, 0, 0));
+    data0 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
     VEC_DATA_TYPE(DATA_TYPE, 3)
-    data1 = vload3(0, (__global DATA_TYPE *)offset(&input, 0, 1));
+    data1 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
     VEC_DATA_TYPE(DATA_TYPE, 3)
-    data2 = vload3(0, (__global DATA_TYPE *)offset(&input, 0, 2));
+    data2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
 
     // Perform calculations
     data0         = POOL_OP(data0, data1);
diff --git a/src/core/CL/cl_kernels/types.h b/src/core/CL/cl_kernels/types.h
index 9303be8..8773646 100644
--- a/src/core/CL/cl_kernels/types.h
+++ b/src/core/CL/cl_kernels/types.h
@@ -43,4 +43,14 @@
     float error;           /**< A tracking method specific error. Initialized to 0 by corner detectors. */
 } Keypoint;
 
+/** Detection window struct */
+typedef struct DetectionWindow
+{
+    ushort x;         /**< Top-left x coordinate */
+    ushort y;         /**< Top-left y coordinate */
+    ushort width;     /**< Width of the detection window */
+    ushort height;    /**< Height of the detection window */
+    ushort idx_class; /**< Index of the class */
+    float  score;     /**< Confidence value for the detection window */
+} DetectionWindow;
 #endif // ARM_COMPUTE_TYPES_H
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
index 8228443..26a8b85 100644
--- a/src/core/CL/cl_kernels/warp_helpers.h
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -32,8 +32,8 @@
  */
 inline const float8 clamp_to_border(float8 coords, const float width, const float height)
 {
-    const float4 clamped_x = clamp(coords.even, -1.0, width);
-    const float4 clamped_y = clamp(coords.odd, -1.0, height);
+    const float4 clamped_x = clamp(coords.even, -1.0f, width);
+    const float4 clamped_y = clamp(coords.odd, -1.0f, height);
     return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
 }
 
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 3e4abaa..83bbe6a 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -37,9 +37,15 @@
 
 void CLActivationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
 
     // Set build options
     std::set<std::string> build_opts;
@@ -54,5 +60,5 @@
 
     // Make sure _kernel is initialized before calling the parent's configure
     constexpr unsigned int num_elems_processed_per_iteration = 16;
-    ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+    ICLSimple3DKernel::configure(input, output, num_elems_processed_per_iteration);
 }
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
new file mode 100644
index 0000000..309a153
--- /dev/null
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
+    : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0)
+{
+}
+
+void CLBatchNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
+                                                float epsilon)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+
+    // Set build options
+    std::set<std::string> build_opts;
+    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+    _input   = input;
+    _output  = output;
+    _mean    = mean;
+    _var     = var;
+    _beta    = beta;
+    _gamma   = gamma;
+    _epsilon = epsilon;
+
+    // Create kernel
+    std::string kernel_name = "batchnormalization_layer";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Set kernel static arguments
+    unsigned int idx = 2 * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor(); // Skip the input/output tensors and the mean/var/beta/gamma vectors
+    _kernel.setArg<cl_float>(idx++, _epsilon);
+
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration = 4;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+
+    Window vector_slice = window.first_slice_window_1D();
+    vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+    unsigned int idx = 2 * num_arguments_per_3D_tensor();
+    add_1D_tensor_argument(idx, _mean, vector_slice);
+    add_1D_tensor_argument(idx, _var, vector_slice);
+    add_1D_tensor_argument(idx, _beta, vector_slice);
+    add_1D_tensor_argument(idx, _gamma, vector_slice);
+
+    do
+    {
+        idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
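The element-wise operation that this kernel configures and launches is standard batch normalization; a minimal host-side sketch of the formula (not the actual OpenCL code, and the function name is hypothetical) would be:

    #include <cmath>

    // Reference of the per-element batch normalization applied by the kernel above
    // (illustrative only): normalize with the per-channel statistics, then scale and shift.
    inline float batch_normalize_reference(float x, float mean, float var,
                                           float beta, float gamma, float epsilon)
    {
        const float x_hat = (x - mean) / std::sqrt(var + epsilon);
        return gamma * x_hat + beta;
    }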
diff --git a/src/core/CL/kernels/CLConvolutionLayerWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLConvolutionLayerWeightsReshapeKernel.cpp
deleted file mode 100644
index 738ea31..0000000
--- a/src/core/CL/kernels/CLConvolutionLayerWeightsReshapeKernel.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLConvolutionLayerWeightsReshapeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-using namespace arm_compute;
-
-CLConvolutionLayerWeightsReshapeKernel::CLConvolutionLayerWeightsReshapeKernel()
-    : _input(nullptr), _biases(nullptr), _output(nullptr)
-{
-}
-
-void CLConvolutionLayerWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4);
-    ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2);
-
-    // Check biases
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
-    }
-
-    _biases = biases;
-    _output = output;
-    _input  = input;
-
-    // Create build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace(((biases != nullptr) ? "-DHAS_BIAS" : ""));
-
-    // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
-
-    // Configure window
-    Window win = calculate_max_window(*input->info(), Steps());
-    // The CLConvolutionLayerWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-    ICLKernel::configure(win);
-}
-
-void CLConvolutionLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-    Window out_window;
-    out_window.use_tensor_dimensions(_output->info());
-
-    Window out_slice = out_window.first_slice_window_2D();
-    Window in_slice  = window.first_slice_window_3D();
-
-    Window biases_slice;
-    if(_biases != nullptr)
-    {
-        biases_slice.set(Window::DimX, Window::Dimension(0, _biases->info()->tensor_shape().x(), 1));
-    }
-    unsigned int increment_biases = 0;
-    unsigned int increment_output = 0;
-
-    // Run kernel
-    do
-    {
-        // Set arguments
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, in_slice);
-        add_2D_tensor_argument(idx, _output, out_slice);
-
-        if(_biases != nullptr)
-        {
-            add_2D_tensor_argument(idx, _biases, biases_slice);
-            biases_slice.set(Window::DimX, Window::Dimension(++increment_biases, _biases->info()->dimension(0), 1));
-        }
-
-        _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
-        _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
-        enqueue(queue, *this, in_slice);
-        out_slice.set(Window::DimX, Window::Dimension(++increment_output, _output->info()->dimension(0), 1));
-
-    }
-    while(window.slide_window_slice_3D(in_slice));
-}
diff --git a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
new file mode 100644
index 0000000..73f1ba1
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLDepthConcatenateKernel::CLDepthConcatenateKernel()
+    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0)
+{
+}
+
+BorderSize CLDepthConcatenateKernel::border_size() const
+{
+    return BorderSize(_top_bottom, _left_right);
+}
+
+void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+    // The gaps between the two lowest dimensions of input and output need to be divisible by 2
+    // Otherwise it is not clear how the padding should be added onto the input tensor
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+    _input  = input;
+    _output = output;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth"));
+
+    // Configure kernel window
+    _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+    _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
+
+    const unsigned int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2] + _left_right * output->info()->strides_in_bytes()[0] + _top_bottom *
+                                                           output->info()->strides_in_bytes()[1];
+
+    const unsigned int num_elems_processed_per_iteration = 4;
+    const unsigned int num_elems_read_per_iteration      = 4;
+    const unsigned int num_rows_read_per_iteration       = 1;
+
+    // The window needs to be based on input as we copy all the depths of input
+    Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size());
+
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowRectangle(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration),
+                              output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+    unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
+    _kernel.setArg<unsigned int>(idx, offset_to_first_elements_in_bytes);
+
+    ICLKernel::configure(win);
+}
+
+void CLDepthConcatenateKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, slice);
+        add_2D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
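A rough illustration of the offset computed in configure(): the copied slice starts depth_offset planes into the output tensor, shifted by the symmetric x/y border placed around the input. The helper and the numbers below are made up for illustration only.

    #include <cstddef>

    // Hypothetical recreation of offset_to_first_elements_in_bytes (illustrative only).
    size_t concat_offset_bytes(size_t depth_offset, size_t left_right, size_t top_bottom,
                               size_t stride_x_bytes, size_t stride_y_bytes, size_t stride_z_bytes)
    {
        // Skip 'depth_offset' planes of the output, then the symmetric x/y border around the input
        return depth_offset * stride_z_bytes + left_right * stride_x_bytes + top_bottom * stride_y_bytes;
    }

    // e.g. depth_offset = 64, strides = {4, 512, 65536} bytes, border = (2, 2)
    //      -> 64 * 65536 + 2 * 4 + 2 * 512 = 4195336 bytes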
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 5bbc3ef..981aad6 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -63,6 +63,8 @@
     ARM_COMPUTE_ERROR_ON(tensor == nullptr);
     ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1);
 
+    border_size.limit(tensor->info()->padding());
+
     // If there is no border: early exit
     if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
     {
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 522674d..d7388e8 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -24,6 +24,8 @@
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
@@ -106,7 +108,16 @@
 
         // Create kernel
         std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
-        _kernel                    = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_mm_" + data_type_name), build_opts));
+
+        if(data_type_name == "f32")
+        {
+            GPUTarget arch_target = get_arch_from_target(get_target());
+            _kernel               = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_f32_" + string_from_target(arch_target), build_opts));
+        }
+        else
+        {
+            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_" + data_type_name, build_opts));
+        }
 
         // Configure window kernel
         const unsigned int     num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
@@ -115,7 +126,7 @@
         Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
         AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
-        AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+        AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
         AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
 
         update_window_and_padding(win, input0_access, input1_access, output_access);
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 30195d9..ecee1ab 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -41,11 +41,18 @@
 void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    TensorShape  output_shape{ input->info()->tensor_shape() };
+    const size_t transpose_w = 16 / input->info()->element_size();
+    output_shape.set(0, input->info()->dimension(1) * transpose_w);
+    output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(static_cast<float>(input->info()->dimension(0)) / 16.0f)) && (input->info()->data_type() == DataType::U8));
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(static_cast<float>(input->info()->dimension(0)) / 8.0f)) && (input->info()->data_type() == DataType::F16));
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(static_cast<float>(input->info()->dimension(0)) / 4.0f)) && (input->info()->data_type() == DataType::F32));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
 
     _input                                               = input;
     _output                                              = output;
@@ -97,3 +104,26 @@
 
     ICLKernel::configure(win);
 }
+
+void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Output is transposed
+    Window out_window(window);
+    out_window.set(Window::DimX, window.y());
+    out_window.set(Window::DimY, window.x());
+
+    Window in_slice  = window.first_slice_window_2D();
+    Window out_slice = out_window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, in_slice);
+        add_2D_tensor_argument(idx, _output, out_slice);
+        enqueue(queue, *this, in_slice, _lws_hint);
+    }
+    while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
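The output shape computed in configure() packs transpose_w = 16 / element_size elements per transposed row. Below is a hypothetical example with made-up sizes (not library code) showing how the shape works out for an F32 input.

    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        const size_t element_size = 4;                 // hypothetical F32 input
        const size_t transpose_w  = 16 / element_size; // 4 elements packed per transposed row
        const size_t in_w = 127, in_h = 64;            // made-up input shape

        const size_t out_w = in_h * transpose_w;                                                      // 256
        const size_t out_h = static_cast<size_t>(std::ceil(in_w / static_cast<float>(transpose_w))); // 32

        std::printf("output shape: %zu x %zu\n", out_w, out_h);
        return 0;
    }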
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
new file mode 100644
index 0000000..87659c4
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLHOGOrientationBinningKernel::CLHOGOrientationBinningKernel()
+    : _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_size()
+{
+}
+
+void CLHOGOrientationBinningKernel::configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
+    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
+    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));
+
+    _input_magnitude = input_magnitude;
+    _input_phase     = input_phase;
+    _output          = output;
+    _cell_size       = hog_info->cell_size();
+
+    float phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? hog_info->num_bins() / 360.0f : hog_info->num_bins() / 180.0f);
+    phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f);
+
+    std::stringstream args_str;
+    args_str << "-DCELL_WIDTH=" << hog_info->cell_size().width << " ";
+    args_str << "-DCELL_HEIGHT=" << hog_info->cell_size().height << " ";
+    args_str << "-DNUM_BINS=" << hog_info->num_bins() << " ";
+    args_str << "-DPHASE_SCALE=" << phase_scale << " ";
+
+    // Construct kernel name
+    std::set<std::string> build_opts = {};
+    build_opts.insert(args_str.str());
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_orientation_binning", build_opts));
+
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+    constexpr unsigned int num_elems_read_per_iteration      = 1;
+    const unsigned int     num_rows_read_per_iteration       = hog_info->cell_size().height;
+    constexpr unsigned int num_elems_written_per_iteration   = 1;
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+                              AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+                              output_access);
+
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+void CLHOGOrientationBinningKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+    do
+    {
+        // Compute slice for the magnitude and phase tensors
+        Window slice_mag_phase = window.first_slice_window_2D();
+        slice_mag_phase.set(Window::DimX, Window::Dimension(window.x().start() * _cell_size.width, window.x().start() * _cell_size.width, _cell_size.width));
+        slice_mag_phase.set(Window::DimY, Window::Dimension(window.y().start() * _cell_size.height, window.y().start() * _cell_size.height, _cell_size.height));
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input_magnitude, slice_mag_phase);
+        add_2D_tensor_argument(idx, _input_phase, slice_mag_phase);
+        add_2D_tensor_argument(idx, _output, slice);
+
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
+
+CLHOGBlockNormalizationKernel::CLHOGBlockNormalizationKernel()
+    : _input(nullptr), _output(nullptr), _num_cells_per_block_stride()
+{
+}
+
+void CLHOGBlockNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info)
+{
+    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+
+    // Number of cells per block
+    const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
+                                     hog_info->block_size().height / hog_info->cell_size().height);
+
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins() * num_cells_per_block.area(), DataType::F32);
+
+    // Number of cells per block stride
+    const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
+                                            hog_info->block_stride().height / hog_info->cell_size().height);
+
+    _input                      = input;
+    _output                     = output;
+    _num_cells_per_block_stride = num_cells_per_block_stride;
+
+    std::stringstream args_str;
+    args_str << "-DL2_HYST_THRESHOLD=" << hog_info->l2_hyst_threshold() << " ";
+    args_str << "-DNUM_CELLS_PER_BLOCK_HEIGHT=" << num_cells_per_block.height << " ";
+    args_str << "-DNUM_BINS_PER_BLOCK_X=" << num_cells_per_block.width *hog_info->num_bins() << " ";
+    args_str << "-DNUM_BINS_PER_BLOCK=" << _output->info()->num_channels() << " ";
+    args_str << "-DL2_NORM=" << static_cast<int>(HOGNormType::L2_NORM) << " ";
+    args_str << "-DL1_NORM=" << static_cast<int>(HOGNormType::L1_NORM) << " ";
+    args_str << "-DL2HYS_NORM=" << static_cast<int>(HOGNormType::L2HYS_NORM) << " ";
+    args_str << "-DHOG_NORM_TYPE=" << static_cast<int>(hog_info->normalization_type()) << " ";
+
+    // Construct kernel name
+    std::set<std::string> build_opts = {};
+    build_opts.insert(args_str.str());
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_block_normalization", build_opts));
+
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+    constexpr unsigned int num_elems_read_per_iteration      = 1;
+    const unsigned int     num_rows_read_per_iteration       = num_cells_per_block.height;
+    constexpr unsigned int num_elems_written_per_iteration   = 1;
+    const unsigned int     num_rows_written_per_iteration    = num_cells_per_block.height;
+
+    // Configure kernel window
+    Window                win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+                              output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+void CLHOGBlockNormalizationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+    do
+    {
+        // Compute slice for the input (cell histogram) tensor
+        Window slice_in = window.first_slice_window_2D();
+        slice_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
+        slice_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, slice_in);
+        add_2D_tensor_argument(idx, _output, slice);
+
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
new file mode 100644
index 0000000..0f9a989
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLHOGDetectorKernel::CLHOGDetectorKernel()
+    : _input(nullptr), _detection_windows(), _num_detection_windows(nullptr)
+{
+}
+
+void CLHOGDetectorKernel::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride,
+                                    float threshold, uint16_t idx_class)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(hog == nullptr);
+    ARM_COMPUTE_ERROR_ON(detection_windows == nullptr);
+    ARM_COMPUTE_ERROR_ON(num_detection_windows == nullptr);
+    ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0);
+    ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0);
+
+    const Size2D &detection_window_size = hog->info()->detection_window_size();
+    const Size2D &block_size            = hog->info()->block_size();
+    const Size2D &block_stride          = hog->info()->block_stride();
+
+    _input                 = input;
+    _detection_windows     = detection_windows;
+    _num_detection_windows = num_detection_windows;
+
+    const unsigned int num_bins_per_descriptor_x   = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels();
+    const unsigned int num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1;
+
+    ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size());
+
+    std::stringstream args_str;
+    args_str << "-DNUM_BLOCKS_PER_DESCRIPTOR_Y=" << num_blocks_per_descriptor_y << " ";
+    args_str << "-DNUM_BINS_PER_DESCRIPTOR_X=" << num_bins_per_descriptor_x << " ";
+    args_str << "-DTHRESHOLD=" << threshold << " ";
+    args_str << "-DMAX_NUM_DETECTION_WINDOWS=" << detection_windows->max_num_values() << " ";
+    args_str << "-DIDX_CLASS=" << idx_class << " ";
+    args_str << "-DBLOCK_STRIDE_WIDTH=" << block_stride.width << " ";
+    args_str << "-DBLOCK_STRIDE_HEIGHT=" << block_stride.height << " ";
+    args_str << "-DDETECTION_WINDOW_WIDTH=" << detection_window_size.width << " ";
+    args_str << "-DDETECTION_WINDOW_HEIGHT=" << detection_window_size.height << " ";
+
+    // Construct build options
+    std::set<std::string> build_opts = {};
+    build_opts.insert(args_str.str());
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_detector", build_opts));
+
+    // Set static kernel arguments
+    unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters
+    _kernel.setArg(idx++, hog->cl_buffer());
+    _kernel.setArg(idx++, detection_windows->cl_buffer());
+    _kernel.setArg(idx++, *_num_detection_windows);
+
+    // Get the number of blocks along the x and y directions of the input tensor
+    const ValidRegion &valid_region = input->info()->valid_region();
+    const size_t       num_blocks_x = valid_region.shape[0];
+    const size_t       num_blocks_y = valid_region.shape[1];
+
+    // Get the number of blocks along the x and y directions of the detection window
+    const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width;
+    const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height;
+
+    const size_t window_step_x = detection_window_stride.width / block_stride.width;
+    const size_t window_step_y = detection_window_stride.height / block_stride.height;
+
+    // Configure kernel window
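+    // The window spans every block position where a complete detection window still fits, stepping by the detection window stride expressed in blocks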
+    Window win;
+    win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x));
+    win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y));
+
+    constexpr unsigned int num_elems_read_per_iteration = 1;
+    const unsigned int     num_rows_read_per_iteration  = num_blocks_per_descriptor_y;
+
+    update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration));
+
+    ICLKernel::configure(win);
+}
+
+void CLHOGDetectorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, slice);
+
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp
index a04aea8..69ede45 100644
--- a/src/core/CL/kernels/CLIntegralImageKernel.cpp
+++ b/src/core/CL/kernels/CLIntegralImageKernel.cpp
@@ -41,11 +41,26 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
 
+    _input  = input;
+    _output = output;
+
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("integral_horizontal"));
 
-    // Configure kernel
-    ICLSimple2DKernel::configure(input, output, input->info()->dimension(0));
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
+    const unsigned int num_elems_accessed_per_iteration  = ceil_to_multiple(num_elems_processed_per_iteration, 16);
+
+    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_accessed_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowHorizontal(input->info(), 0, num_elems_accessed_per_iteration),
+                              output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
 }
 
 CLIntegralImageVertKernel::CLIntegralImageVertKernel()
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000..794a1bc
--- /dev/null
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLLocallyConnectedMatrixMultiplyKernel::CLLocallyConnectedMatrixMultiplyKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void CLLocallyConnectedMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+
+    _input0 = input0;
+    _input1 = input1;
+    _output = output;
+
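+    // Pick a local work-group size hint based on the output height (196 presumably corresponds to a 14x14 map)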
+    if(output->info()->dimension(1) == 196)
+    {
+        _lws_hint = cl::NDRange(1, 7);
+    }
+    else
+    {
+        _lws_hint = cl::NDRange(8, 8);
+    }
+
+    std::ostringstream    mm_arguments;
+    std::set<std::string> build_opts;
+
+    mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " ";
+    build_opts.emplace(mm_arguments.str());
+
+    // Create kernel
+    std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
+    _kernel                    = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_lc_vm_" + data_type_name), build_opts));
+
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+
+    AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+    AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+
+    update_window_and_padding(win, input0_access, input1_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+void CLLocallyConnectedMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    Window matrix_b_window;
+    matrix_b_window.use_tensor_dimensions(_input1->info());
+    Window slice_matrix_b = matrix_b_window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input0, slice);
+        add_3D_tensor_argument(idx, _input1, slice_matrix_b);
+        add_2D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice, _lws_hint);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index 3668513..106a511 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -50,6 +50,7 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
+    ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented");
 
     // Set build options
     std::set<std::string> build_opts;
@@ -59,11 +60,12 @@
     _squared_input = squared_input;
     _output        = output;
 
-    const bool is_in_map = (norm_info.type() == NormType::IN_MAP);
-    _border_size         = (is_in_map) ? BorderSize(0, 3) : BorderSize(0);
+    const bool         is_in_map    = (norm_info.type() == NormType::IN_MAP_1D);
+    const unsigned int border_width = is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
+    _border_size                    = BorderSize(0, border_width);
 
     // Create kernel
-    std::string kernel_name = (norm_info.type() == NormType::IN_MAP) ? "normalization_layer_in_map" : "normalization_layer_cross_map";
+    std::string kernel_name = (norm_info.type() == NormType::IN_MAP_1D) ? "normalization_layer_in_map_1D" : "normalization_layer_cross_map";
     _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
 
     // Set kernel static arguments
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 8fe42d2..dc5ae4e 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -131,7 +131,7 @@
         };
 
         // Set static kernel arguments
-        unsigned int idx = 2 * num_arguments_per_2D_tensor();
+        unsigned int idx = 2 * num_arguments_per_3D_tensor();
         _kernel.setArg<cl_int2>(idx++, max_dims);
         _kernel.setArg<cl_int2>(idx++, strides);
         _kernel.setArg<cl_int2>(idx++, paddings);
@@ -161,7 +161,7 @@
     std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
 
-    Window slice = window.first_slice_window_2D();
+    Window slice = window.first_slice_window_3D();
 
     do
     {
@@ -172,9 +172,9 @@
 
         // Set inputs
         unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, in_slice);
-        add_2D_tensor_argument(idx, _output, slice);
+        add_3D_tensor_argument(idx, _input, in_slice);
+        add_3D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice);
     }
-    while(window.slide_window_slice_2D(slice));
+    while(window.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 7c709bb..0470d52 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -45,21 +45,27 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
-    _input                                               = input;
-    _output                                              = output;
-    const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
+    _input  = input;
+    _output = output;
+
+    // The kernel loops over all elements in steps of 16
+    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
 
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DUSE_" + string_from_data_type(input->info()->data_type())));
-    build_opts.emplace(((num_elems_processed_per_iteration % max_cl_vector_width) != 0) ? "-DNON_MULTIPLE_OF_16" : "");
+    std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+
+    // Tell the kernel that the width is not a multiple of 16
+    if((input->info()->dimension(0) % max_cl_vector_width) != 0)
+    {
+        build_opts.emplace("-DNON_MULTIPLE_OF_16");
+    }
 
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_max", build_opts));
 
     // Set fixed arguments
     unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg<cl_uint>(idx++, num_elems_processed_per_iteration);
+    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
 
     // Configure kernel window
     constexpr unsigned int num_elems_written_per_iteration = 1;
@@ -88,23 +94,29 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
 
-    _input                                               = input;
-    _max                                                 = max;
-    _output                                              = output;
-    _sum                                                 = sum;
-    const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
+    _input  = input;
+    _max    = max;
+    _output = output;
+    _sum    = sum;
+
+    // The kernel loops over all elements in steps of 16
+    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
 
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DUSE_" + string_from_data_type(input->info()->data_type())));
-    build_opts.emplace(((num_elems_processed_per_iteration % max_cl_vector_width) != 0) ? "-DNON_MULTIPLE_OF_16" : "");
+    std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+
+    // Tell the kernel that the width is not a multiple of 16
+    if((input->info()->dimension(0) % max_cl_vector_width) != 0)
+    {
+        build_opts.emplace("-DNON_MULTIPLE_OF_16");
+    }
 
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
 
     // Set fixed arguments
     unsigned int idx = 4 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg<cl_uint>(idx++, num_elems_processed_per_iteration);
+    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
 
     // Configure window
     Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
@@ -190,10 +202,13 @@
 
     do
     {
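+        // The sum tensor holds a single value per row, so restrict its window to one element along X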
+        Window sum_slice = slice;
+        sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));
+
         unsigned int idx = 0;
         // Set inputs
         add_2D_tensor_argument(idx, _input, slice);
-        add_2D_tensor_argument(idx, _sum, slice);
+        add_2D_tensor_argument(idx, _sum, sum_slice);
         add_2D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice);
     }
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index 49931f4..2ee6fcb 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -40,11 +40,20 @@
 
 void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    TensorShape  output_shape{ input->info()->tensor_shape() };
+    const size_t w_out = input->info()->dimension(1);
+    const size_t h_out = input->info()->dimension(0);
+    output_shape.set(0, w_out);
+    output_shape.set(1, h_out);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
 
     _input    = input;
     _output   = output;
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
new file mode 100644
index 0000000..018f272
--- /dev/null
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+CLWeightsReshapeKernel::CLWeightsReshapeKernel(bool is_shared)
+    : _is_shared(is_shared), _input(nullptr), _biases(nullptr), _output(nullptr)
+{
+}
+
+void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    if(_is_shared)
+    {
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(4) != (output->info()->dimension(2)));
+        ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 5);
+        ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 3);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4);
+        ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2);
+    }
+
+    // Check biases
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+    }
+
+    _biases = biases;
+    _output = output;
+    _input  = input;
+
+    // Create build options
+    std::set<std::string> build_opts;
+    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    build_opts.emplace(((biases != nullptr) ? "-DHAS_BIAS" : ""));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
+
+    // Set static arguments
+    unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+    idx += (biases != nullptr) ? num_arguments_per_1D_tensor() : 0;
+    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
+    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
+    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(2));
+    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(3));
+
+    // Configure window
+    Window win = calculate_max_window(*input->info(), Steps());
+    // The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    ICLKernel::configure(win);
+}
+
+CLConvolutionLayerWeightsReshapeKernel::CLConvolutionLayerWeightsReshapeKernel()
+    : CLWeightsReshapeKernel(false)
+{
+}
+
+void CLConvolutionLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    Window out_window;
+    out_window.use_tensor_dimensions(_output->info());
+
+    Window in_slice  = window.first_slice_window_3D();
+    Window out_slice = out_window.first_slice_window_2D();
+
+    // Set arguments
+    unsigned idx = 0;
+    add_3D_tensor_argument(idx, _input, in_slice);
+    add_2D_tensor_argument(idx, _output, out_slice);
+    if(_biases != nullptr)
+    {
+        Window biases_slice;
+        biases_slice.set(Window::DimX, Window::Dimension(0, _biases->info()->tensor_shape().x(), 1));
+        add_1D_tensor_argument(idx, _biases, biases_slice);
+    }
+
+    // Run kernel
+    enqueue(queue, *this, in_slice);
+}
+
+CLLocallyConnectedLayerWeightsReshapeKernel::CLLocallyConnectedLayerWeightsReshapeKernel()
+    : CLWeightsReshapeKernel(true)
+{
+}
+
+void CLLocallyConnectedLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    Window out_window;
+    out_window.use_tensor_dimensions(_output->info());
+
+    Window in_slice  = window.first_slice_window_3D();
+    Window out_slice = out_window.first_slice_window_2D();
+
+    Window biases_window;
+    Window biases_slice;
+
+    if(_biases != nullptr)
+    {
+        biases_window.use_tensor_dimensions(_biases->info());
+        biases_slice = biases_window.first_slice_window_1D();
+    }
+
+    do
+    {
+        // Set arguments
+        unsigned idx = 0;
+        add_3D_tensor_argument(idx, _input, in_slice);
+        add_2D_tensor_argument(idx, _output, out_slice);
+        if(_biases != nullptr)
+        {
+            add_1D_tensor_argument(idx, _biases, biases_slice);
+            biases_window.slide_window_slice_1D(biases_slice);
+        }
+
+        // Run kernel
+        enqueue(queue, *this, in_slice);
+    }
+    while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
diff --git a/src/core/NEON/kernels/NEHOGNonMaximaSuppressionKernel.cpp b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
similarity index 84%
rename from src/core/NEON/kernels/NEHOGNonMaximaSuppressionKernel.cpp
rename to src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
index 4ab3ddf..62bfdd6 100644
--- a/src/core/NEON/kernels/NEHOGNonMaximaSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEHOGNonMaximaSuppressionKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -39,17 +39,17 @@
 }
 } // namespace
 
-NEHOGNonMaximaSuppressionKernel::NEHOGNonMaximaSuppressionKernel()
+CPPDetectionWindowNonMaximaSuppressionKernel::CPPDetectionWindowNonMaximaSuppressionKernel()
     : _input_output(nullptr), _min_distance(0.0f)
 {
 }
 
-bool NEHOGNonMaximaSuppressionKernel::is_parallelisable() const
+bool CPPDetectionWindowNonMaximaSuppressionKernel::is_parallelisable() const
 {
     return false;
 }
 
-void NEHOGNonMaximaSuppressionKernel::configure(IDetectionWindowArray *input_output, float min_distance)
+void CPPDetectionWindowNonMaximaSuppressionKernel::configure(IDetectionWindowArray *input_output, float min_distance)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == input_output);
 
@@ -59,7 +59,7 @@
     IKernel::configure(Window()); // Default 1 iteration window
 }
 
-void NEHOGNonMaximaSuppressionKernel::run(const Window &window)
+void CPPDetectionWindowNonMaximaSuppressionKernel::run(const Window &window)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
@@ -68,12 +68,12 @@
     const size_t num_candidates = _input_output->num_values();
     size_t       num_detections = 0;
 
-    /* Sort list of candidates */
+    // Sort list of candidates
     std::sort(_input_output->buffer(), _input_output->buffer() + num_candidates, compare_detection_window);
 
     const float min_distance_pow2 = _min_distance * _min_distance;
 
-    /* Euclidean distance */
+    // Euclidean distance
     for(size_t i = 0; i < num_candidates; ++i)
     {
         if(0.0f != _input_output->at(i).score)
@@ -86,7 +86,7 @@
             cur.idx_class = _input_output->at(i).idx_class;
             cur.score     = _input_output->at(i).score;
 
-            /* Store window */
+            // Store window
             _input_output->at(num_detections) = cur;
 
             ++num_detections;
@@ -108,7 +108,7 @@
 
                     if(d < min_distance_pow2)
                     {
-                        /* Invalidate keypoint */
+                        // Invalidate keypoint
                         _input_output->at(k).score = 0.0f;
                     }
                 }
diff --git a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
index d3e39dc..09d3ccf 100644
--- a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
+++ b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
@@ -60,7 +60,7 @@
     _output                = output;
     _min_distance          = min_distance * min_distance; // We compare squares of distances
     _num_corner_candidates = num_corner_candidates;
-    INEKernel::configure(Window()); // Default 1 iteration window
+    ICPPKernel::configure(Window()); // Default 1 iteration window
 }
 
 bool CPPSortEuclideanDistanceKernel::is_parallelisable() const
@@ -71,7 +71,7 @@
 void CPPSortEuclideanDistanceKernel::run(const Window &window)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICPPKernel::window(), window);
 
     const int32_t num_corner_candidates = *_num_corner_candidates;
 
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
index ad4f343..389e390 100644
--- a/src/core/Error.cpp
+++ b/src/core/Error.cpp
@@ -25,6 +25,7 @@
 
 #include <cstdarg>
 #include <cstdio>
+#include <iostream>
 #include <stdexcept>
 
 void arm_compute::error(const char *function, const char *file, const int line, const char *msg, ...)
@@ -38,3 +39,14 @@
 
     throw std::runtime_error(std::string(out));
 }
+
+void arm_compute::debug(const char *function, const char *file, const int line, const char *msg, ...)
+{
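+    // Formats the message like error() but prints it to stdout instead of throwing an exception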
+    char    out[512];
+    va_list args;
+    va_start(args, msg);
+    int offset = snprintf(out, sizeof(out), "in %s %s:%d: ", function, file, line);
+    vsnprintf(out + offset, sizeof(out) - offset, msg, args);
+    va_end(args);
+    std::cout << std::string(out) << std::endl;
+}
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index 9d93459..ff903e9 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -25,7 +25,7 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/IKernel.h"
-#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Utils.h"
 
 #include <algorithm>
@@ -33,7 +33,7 @@
 
 using namespace arm_compute;
 
-Window arm_compute::calculate_max_window(const TensorInfo &info, const Steps &steps, bool skip_border, BorderSize border_size)
+Window arm_compute::calculate_max_window(const ITensorInfo &info, const Steps &steps, bool skip_border, BorderSize border_size)
 {
     if(!skip_border)
     {
@@ -76,7 +76,45 @@
     return window;
 }
 
-Window arm_compute::calculate_max_window_horizontal(const TensorInfo &info, const Steps &steps, bool skip_border, BorderSize border_size)
+Window arm_compute::calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps, BorderSize border_size)
+{
+    const Coordinates &anchor = info.valid_region().anchor;
+    const TensorShape &shape  = info.valid_region().shape;
+
+    Window window;
+
+    window.set(0, Window::Dimension(
+                   // Move the window start to include the left border
+                   anchor[0] - border_size.left,
+                   // Move the window end to include the right border
+                   // Make sure the window width is a multiple of the step size
+                   anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
+                   steps[0]));
+
+    size_t             n            = 1;
+    const TensorShape &tensor_shape = info.tensor_shape();
+
+    if(tensor_shape.num_dimensions() > 1)
+    {
+        window.set(1, Window::Dimension(
+                       // Include the border above the image
+                       anchor[1] - border_size.top,
+                       // Include the border below the image
+                       anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
+                       steps[1]));
+
+        ++n;
+    }
+
+    for(; n < Coordinates::num_max_dimensions; ++n)
+    {
+        window.set(n, Window::Dimension(0, std::max<size_t>(1, tensor_shape[n])));
+    }
+
+    return window;
+}
+
+Window arm_compute::calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps, bool skip_border, BorderSize border_size)
 {
     if(skip_border)
     {
diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
index cbc59e6..4ddc0fe 100644
--- a/src/core/IAccessWindow.cpp
+++ b/src/core/IAccessWindow.cpp
@@ -56,7 +56,10 @@
     // Additionally the valid region is shifted by the offset that is used by
     // the kernel to write back output values.
     anchor.set(0, std::max<int>(window.x().start() * _scale_x, anchor[0] + border_size.left) + _x);
-    anchor.set(1, std::max<int>(window.y().start() * _scale_y, anchor[1] + border_size.top) + _y);
+    if(_info->num_dimensions() > 1)
+    {
+        anchor.set(1, std::max<int>(window.y().start() * _scale_y, anchor[1] + border_size.top) + _y);
+    }
 
     // End of the valid region is equal to the start of the last write of the
     // kernel plus the number of written elements. (This assumes that all
@@ -67,7 +70,10 @@
     // execution window. Afterwards the new end points are converted back into
     // a size of the region.
     shape.set(0, std::min<int>(old_anchor[0] + shape[0] - border_size.right, (window.x().end() - window.x().step()) * _scale_x + _width) - anchor[0]);
-    shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom, (window.y().end() - window.y().step()) * _scale_y + _height) - anchor[1]);
+    if(_info->num_dimensions() > 1)
+    {
+        shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom, (window.y().end() - window.y().step()) * _scale_y + _height) - anchor[1]);
+    }
 
     // For higher dimensions use the intersection of the window size and the
     // valid region of the input
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index 04b827e..0b29eca 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -39,8 +39,8 @@
         return;
     }
 
-    const TensorInfo *src_info = src.info();
-    TensorInfo       *dst_info = this->info();
+    const ITensorInfo *src_info = src.info();
+    ITensorInfo       *dst_info = this->info();
 
     ARM_COMPUTE_ERROR_ON(src_info->num_dimensions() > dst_info->num_dimensions());
     ARM_COMPUTE_ERROR_ON(src_info->num_channels() != dst_info->num_channels());
diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
index b3d9c6d..edb0a0f 100644
--- a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
+++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
@@ -132,6 +132,20 @@
 
 void NEAbsoluteDifferenceKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+    set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+    if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+    {
+        set_format_if_unknown(*output->info(), Format::S16);
+    }
+    else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+    {
+        set_format_if_unknown(*output->info(), Format::U8);
+    }
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp
index 88e9b86..e5b933a 100644
--- a/src/core/NEON/kernels/NEAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEAccumulateKernel.cpp
@@ -248,8 +248,15 @@
 
 void NEAccumulateKernel::configure(const ITensor *input, ITensor *accum)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
+
+    set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
+
+    set_format_if_unknown(*accum->info(), Format::S16);
+
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
 
     constexpr unsigned int num_elems_processed_per_iteration = 16;
     INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
@@ -276,6 +283,13 @@
 
 void NEAccumulateWeightedKernel::configure(const ITensor *input, float alpha, ITensor *accum)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
+
+    set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
+
+    set_format_if_unknown(*accum->info(), Format::U8);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0);
@@ -311,6 +325,13 @@
 
 void NEAccumulateSquaredKernel::configure(const ITensor *input, uint32_t shift, ITensor *accum)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
+
+    set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
+
+    set_format_if_unknown(*accum->info(), Format::S16);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
     ARM_COMPUTE_ERROR_ON(shift > 15);
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index e7166b0..a878078 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -23,8 +23,10 @@
  */
 #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
 
+#include "arm_compute/core/FixedPoint.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
@@ -33,6 +35,7 @@
 
 #include <arm_neon.h>
 #include <array>
+#include <cmath>
 #include <map>
 
 using namespace arm_compute;
@@ -44,39 +47,71 @@
 
 void NEActivationLayerKernel::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
 
-    static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map =
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+    // Activation functions : FP32
+    static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 =
     {
-        { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS> },
-        { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR> },
-        { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC> },
-        { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU> },
-        { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU> },
-        { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU> },
-        { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT> },
-        { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE> },
-        { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH> },
+        { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, float> },
+        { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, float> },
+        { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, float> },
+        { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, float> },
+        { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, float> },
+        { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, float> },
+        { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, float> },
+        { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, float> },
+        { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, float> },
     };
+
+    // Activation functions : QS8
+    static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs8 =
+    {
+        { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, qint8_t> },
+        { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, qint8_t> },
+        { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qint8_t> },
+        { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qint8_t> },
+        { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qint8_t> },
+        { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, qint8_t> },
+        { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, qint8_t> },
+        { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint8_t> },
+        { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint8_t> },
+    };
+
     _input    = input;
     _output   = output;
-    _func     = act_map[activation_info.activation()];
     _act_info = activation_info;
+    switch(input->info()->data_type())
+    {
+        case DataType::F32:
+            _func = act_map_f32[activation_info.activation()];
+            break;
+        case DataType::QS8:
+            _func = act_map_qs8[activation_info.activation()];
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type.");
+    }
 
     constexpr unsigned int num_elems_processed_per_iteration = 16;
 
     INESimpleKernel::configure(_input, _output, num_elems_processed_per_iteration);
 }
 
-template <ActivationLayerInfo::ActivationFunction F>
-void NEActivationLayerKernel::activation(const Window &window)
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, float>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
 {
     Iterator input(_input, window);
     Iterator output(_output, window);
 
-    static const float32x4_t CONST_1 = vdupq_n_f32(1.f); // 1.f
+    static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
     static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
     const float32x4_t        a       = vdupq_n_f32(_act_info.a());
     const float32x4_t        b       = vdupq_n_f32(_act_info.b());
@@ -199,6 +234,64 @@
     input, output);
 }
 
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+{
+    Iterator input(_input, window);
+    Iterator output(_output, window);
+    int      fixed_point_position = _input->info()->fixed_point_position();
+
+    static const qint8x16_t CONST_0 = vdupq_n_qs8(0);
+    const qint8x16_t        CONST_1 = vdupq_n_qs8(scvt_qs8_f32(1.f, fixed_point_position));
+    const qint8x16_t        a       = vdupq_n_qs8(scvt_qs8_f32(_act_info.a(), fixed_point_position));
+    const qint8x16_t        b       = vdupq_n_qs8(scvt_qs8_f32(_act_info.b(), fixed_point_position));
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+        const qint8x16_t in  = vld1q_qs8(input_ptr);
+        qint8x16_t       tmp = {};
+
+        switch(F)
+        {
+            case ActivationFunction::ABS:
+                tmp = vqabsq_qs8(in);
+                break;
+            case ActivationFunction::BOUNDED_RELU:
+                tmp = vminq_qs8(a, vmaxq_qs8(CONST_0, in));
+                break;
+            case ActivationFunction::LINEAR:
+                tmp = vqmlaq_qs8(b, a, in, fixed_point_position);
+                break;
+            case ActivationFunction::LOGISTIC:
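+                // logistic(x) = 1 / (1 + exp(-x)), evaluated with fixed-point reciprocal and exponential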
+                tmp = vrecipq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(vnegq_s8(in), fixed_point_position)), fixed_point_position);
+                break;
+            case ActivationFunction::RELU:
+                tmp = vmaxq_qs8(CONST_0, in);
+                break;
+            case ActivationFunction::SOFT_RELU:
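+                // soft_relu(x) = log(1 + exp(x)), evaluated in fixed-point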
+                tmp = vlogq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(in, fixed_point_position)), fixed_point_position);
+                break;
+            case ActivationFunction::SQRT:
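+                // sqrt(x) computed as the reciprocal of the inverse square root, i.e. 1 / (1 / sqrt(x))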
+                tmp = vrecipq_qs8(vinvsqrtq_qs8(in, fixed_point_position), fixed_point_position);
+                break;
+            case ActivationFunction::SQUARE:
+                tmp = vqmulq_qs8(in, in, fixed_point_position);
+                break;
+            case ActivationFunction::TANH:
+                tmp = vtanhq_qs8(in, fixed_point_position);
+                break;
+            default:
+                break;
+        }
+
+        vst1q_qs8(output_ptr, tmp);
+    },
+    input, output);
+}
+
 void NEActivationLayerKernel::run(const Window &window)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index 55c4b76..a4fdad8 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 
+#include <algorithm>
 #include <arm_neon.h>
 #include <cstdint>
 #include <map>
@@ -291,6 +292,20 @@
 
 void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+    set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+    if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+    {
+        set_format_if_unknown(*output->info(), Format::S16);
+    }
+    else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+    {
+        set_format_if_unknown(*output->info(), Format::F32);
+    }
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32);
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index f6ee1d1..d3e62b0 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 
+#include <algorithm>
 #include <arm_neon.h>
 #include <cstdint>
 #include <map>
@@ -284,11 +285,23 @@
 
 void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+    set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+    if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+    {
+        set_format_if_unknown(*output->info(), Format::S16);
+    }
+    else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+    {
+        set_format_if_unknown(*output->info(), Format::F32);
+    }
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32);
-
-    /* If one of the inputs is 16bit then the output must be 16bit too: */
     ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
                              "Output can only be U8 if both inputs are U8");
 
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
new file mode 100644
index 0000000..9a216ae
--- /dev/null
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon()
+{
+}
+
+void batch_normalization_q8(const ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+{
+    Iterator input(in, window);
+    Iterator output(out, window);
+
+    // Hold information about the current feature map we are iterating over.
+    // Only compute denominator and NEON vectors once per feature map.
+    int slice = -1;
+
+    int        fixed_point_position = in->info()->fixed_point_position();
+    const auto input_mean           = reinterpret_cast<const qint8_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var            = reinterpret_cast<const qint8_t *>(var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma          = reinterpret_cast<const qint8_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
+    const auto input_beta           = reinterpret_cast<const qint8_t *>(beta->ptr_to_element(Coordinates(0, 0)));
+
+    qint8x16_t       mean_vec    = vdupq_n_qs8(0);
+    qint8x16_t       var_vec     = vdupq_n_qs8(0);
+    qint8x16_t       gamma_vec   = vdupq_n_qs8(0);
+    qint8x16_t       beta_vec    = vdupq_n_qs8(0);
+    qint8x16_t       denominator = vdupq_n_qs8(0);
+    const qint8x16_t epsilon_vec = vdupq_n_qs8(scvt_qs8_f32(epsilon, fixed_point_position));
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        if(slice != id.z())
+        {
+            // Construct vectors
+            mean_vec  = vdupq_n_qs8(*(input_mean + id.z()));
+            var_vec   = vdupq_n_qs8(*(input_var + id.z()));
+            gamma_vec = vdupq_n_qs8(*(input_gamma + id.z()));
+            beta_vec  = vdupq_n_qs8(*(input_beta + id.z()));
+
+            // Calculate denominator
+            denominator = vqinvsqrtq_qs8(vqaddq_qs8(var_vec, epsilon_vec), fixed_point_position);
+            slice       = id.z();
+        }
+
+        // Calculate x bar and store results
+        const qint8x16_t numerator = vqsubq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), mean_vec);
+        const qint8x16_t x_bar     = vqmulq_qs8(numerator, denominator, fixed_point_position);
+        vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmlaq_qs8(beta_vec, x_bar, gamma_vec, fixed_point_position));
+    },
+    input, output);
+}
+
+void batch_normalization_fp32(const ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+{
+    Iterator input(in, window);
+    Iterator output(out, window);
+
+    // Hold information about the current feature map we are iterating over.
+    // Only compute denominator and NEON vectors once per feature map.
+    int slice = -1;
+
+    const auto input_mean  = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var   = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma = reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0)));
+    const auto input_beta  = reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0)));
+
+    float32x4_t       mean_vec    = vdupq_n_f32(0.0);
+    float32x4_t       var_vec     = vdupq_n_f32(0.0);
+    float32x4_t       gamma_vec   = vdupq_n_f32(0.0);
+    float32x4_t       beta_vec    = vdupq_n_f32(0.0);
+    float32x4_t       denominator = vdupq_n_f32(0.0);
+    const float32x4_t epsilon_vec = vdupq_n_f32(epsilon);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        if(slice != id.z())
+        {
+            // Construct vectors
+            mean_vec  = vdupq_n_f32(*(input_mean + id.z()));
+            var_vec   = vdupq_n_f32(*(input_var + id.z()));
+            gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
+            beta_vec  = vdupq_n_f32(*(input_beta + id.z()));
+
+            // Calculate denominator
+            denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
+            slice       = id.z();
+        }
+
+        // Calculate x bar and store results
+        const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
+        const float32x4_t x_bar     = vmulq_f32(numerator, denominator);
+        vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmlaq_f32(beta_vec, x_bar, gamma_vec));
+    },
+    input, output);
+}
+
+void NEBatchNormalizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+    _input   = input;
+    _output  = output;
+    _mean    = mean;
+    _var     = var;
+    _gamma   = gamma;
+    _beta    = beta;
+    _epsilon = epsilon;
+
+    unsigned int num_elems_processed_per_iteration = 0;
+
+    switch(input->info()->data_type())
+    {
+        case DataType::QS8:
+            _func                             = &batch_normalization_q8;
+            num_elems_processed_per_iteration = 16;
+            break;
+        case DataType::F32:
+            _func                             = &batch_normalization_fp32;
+            num_elems_processed_per_iteration = 4;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported");
+            break;
+    }
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    INEKernel::configure(win);
+}
+
+void NEBatchNormalizationLayerKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    (*_func)(_input, _output, _mean, _var, _beta, _gamma, _epsilon, window);
+}
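Editor's note: the F32 path above is a vectorised form of the usual inference-time batch normalization, applied per feature map (z slice). A scalar sketch of the same computation, with hypothetical raw-pointer arguments rather than the ITensor interface:

#include <cmath>
#include <cstddef>

// Scalar reference: out = gamma[z] * (in - mean[z]) / sqrt(var[z] + epsilon) + beta[z],
// with one set of parameters per feature map (the z dimension).
void batch_normalization_ref(const float *in, float *out,
                             const float *mean, const float *var,
                             const float *beta, const float *gamma,
                             float epsilon, size_t width, size_t height, size_t channels)
{
    for(size_t z = 0; z < channels; ++z)
    {
        // The denominator only depends on the slice, so compute it once per feature map.
        const float denom = 1.0f / std::sqrt(var[z] + epsilon);
        for(size_t i = 0; i < width * height; ++i)
        {
            const size_t idx   = z * width * height + i;
            const float  x_bar = (in[idx] - mean[z]) * denom;
            out[idx]           = gamma[z] * x_bar + beta[z];
        }
    }
}
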
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index dfde4b4..e8e448e 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -56,9 +56,19 @@
 
 void NEBitwiseAndKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+    set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+    set_format_if_unknown(*output->info(), Format::U8);
+    set_format_if_unknown(*input1->info(), Format::U8);
+    set_format_if_unknown(*input2->info(), Format::U8);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
 
     _input1 = input1;
     _input2 = input2;
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
index 246644c..bf75592 100644
--- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -55,8 +55,17 @@
 
 void NEBitwiseNotKernel::configure(const ITensor *input, ITensor *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+    set_format_if_unknown(*output->info(), Format::U8);
+    set_format_if_unknown(*input->info(), Format::U8);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
     _input  = input;
     _output = output;
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
index d974202..f184be2 100644
--- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -56,9 +56,19 @@
 
 void NEBitwiseOrKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+    set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+    set_format_if_unknown(*output->info(), Format::U8);
+    set_format_if_unknown(*input1->info(), Format::U8);
+    set_format_if_unknown(*input2->info(), Format::U8);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
 
     _input1 = input1;
     _input2 = input2;
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
index 20873b2..c4fb4c0 100644
--- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -56,9 +56,19 @@
 
 void NEBitwiseXorKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+    set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+    set_format_if_unknown(*output->info(), Format::U8);
+    set_format_if_unknown(*input1->info(), Format::U8);
+    set_format_if_unknown(*input2->info(), Format::U8);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
 
     _input1 = input1;
     _input2 = input2;
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
index 24b7696..d7e6d73 100644
--- a/src/core/NEON/kernels/NEBox3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
@@ -112,8 +112,17 @@
 
 void NEBox3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+    set_format_if_unknown(*input->info(), Format::U8);
+    set_format_if_unknown(*output->info(), Format::U8);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
     _input  = input;
     _output = output;
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index b722b13..85a2cd5 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -725,6 +725,16 @@
 
 void NEGradientFP16Kernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);
+
+    set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
+    set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());
+
+    Format magnitude_format = gx->info()->data_type() == DataType::S16 ? Format::U16 : Format::U32;
+    set_format_if_unknown(*magnitude->info(), magnitude_format);
+    set_format_if_unknown(*phase->info(), Format::U8);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
@@ -1604,11 +1614,21 @@
 
 void NEGradientKernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);
+
+    set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
+    set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());
+
+    Format magnitude_format = gx->info()->data_type() == DataType::S16 ? Format::U16 : Format::U32;
+    set_format_if_unknown(*magnitude->info(), magnitude_format);
+    set_format_if_unknown(*phase->info(), Format::U8);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(gy->info()->data_type()), "Gx and Gy must have the same element size");
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
     ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()), "Magnitude must have the same element size as Gx and Gy");
 
     _gx        = gx;
@@ -1687,9 +1707,18 @@
 void NEEdgeNonMaxSuppressionKernel::configure(const ITensor *magnitude, const ITensor *phase, ITensor *output,
                                               int32_t upper_thr, int32_t lower_thr, bool border_undefined)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(magnitude, phase, output);
+
+    set_shape_if_empty(*output->info(), magnitude->info()->tensor_shape());
+
+    set_format_if_unknown(*phase->info(), Format::U8);
+    set_format_if_unknown(*output->info(), Format::U8);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(magnitude, phase, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(phase, output);
 
     _magnitude = magnitude;
     _phase     = phase;
@@ -1765,8 +1794,17 @@
 
 void NEEdgeTraceKernel::configure(ITensor *input, ITensor *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+    set_format_if_unknown(*input->info(), Format::U8);
+    set_format_if_unknown(*output->info(), Format::U8);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
     _input  = input;
     _output = output;
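Editor's note: the gradient configure() overloads above pick the magnitude format from the gradient data type: S16 gradients default to a U16 magnitude, S32 gradients to U32, and the phase plane is always U8. A tiny sketch of that selection rule, using an illustrative enum rather than the library's Format/DataType types:

// Illustrative enum; the library uses arm_compute::Format / DataType.
enum class Fmt { U8, U16, U32, S16, S32 };

struct GradientOutputs
{
    Fmt magnitude;
    Fmt phase;
};

// Magnitude width follows the gradient width; phase is always one byte per pixel.
GradientOutputs default_gradient_outputs(Fmt gradient_type)
{
    GradientOutputs out{};
    out.magnitude = (gradient_type == Fmt::S16) ? Fmt::U16 : Fmt::U32;
    out.phase     = Fmt::U8;
    return out;
}
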
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
index b894683..3147a69 100644
--- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -51,13 +51,33 @@
 
 void NEChannelCombineKernel::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
     ARM_COMPUTE_ERROR_ON(plane0 == output);
     ARM_COMPUTE_ERROR_ON(plane1 == output);
     ARM_COMPUTE_ERROR_ON(plane2 == output);
+
+    set_format_if_unknown(*plane0->info(), Format::U8);
+    set_format_if_unknown(*plane1->info(), Format::U8);
+    set_format_if_unknown(*plane2->info(), Format::U8);
+
+    if(plane3 != nullptr)
+    {
+        set_format_if_unknown(*plane3->info(), Format::U8);
+    }
+
+    set_shape_if_empty(*output->info(), plane0->info()->tensor_shape());
+
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane1, plane2);
+
+    if(plane3 != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane0, plane3);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane3);
+    }
 
     const Format &output_format = output->info()->format();
 
@@ -102,6 +122,14 @@
             break;
     }
 
+    TensorShape subsampled_shape_plane1{ plane0->info()->tensor_shape() };
+    subsampled_shape_plane1.set(0, subsampled_shape_plane1[0] / _x_subsampling[1]);
+    TensorShape subsampled_shape_plane2{ plane0->info()->tensor_shape() };
+    subsampled_shape_plane2.set(0, subsampled_shape_plane2[0] / _x_subsampling[2]);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane1->info()->tensor_shape(), subsampled_shape_plane1);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane2->info()->tensor_shape(), subsampled_shape_plane2);
+
     Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration));
 
     AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
@@ -134,14 +162,55 @@
 
 void NEChannelCombineKernel::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    set_format_if_unknown(*plane0->info(), Format::U8);
+    set_format_if_unknown(*plane1->info(), Format::U8);
+    set_format_if_unknown(*plane2->info(), Format::U8);
+
+    set_shape_if_empty(*output->plane(0)->info(), plane0->info()->tensor_shape());
+
+    switch(output->info()->format())
+    {
+        case Format::NV12:
+        case Format::NV21:
+        case Format::IYUV:
+        {
+            TensorShape subsampled_shape = plane0->info()->tensor_shape();
+            subsampled_shape.set(0, subsampled_shape[0] / 2);
+            subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+            set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->plane(1)->info()->tensor_shape(), subsampled_shape);
+
+            if(output->info()->format() == Format::IYUV)
+            {
+                set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
+
+                ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->plane(2)->info()->tensor_shape(), subsampled_shape);
+            }
+            break;
+        }
+        case Format::YUV444:
+            set_shape_if_empty(*output->plane(1)->info(), plane0->info()->tensor_shape());
+            set_shape_if_empty(*output->plane(2)->info(), plane0->info()->tensor_shape());
+
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane1, plane2, output->plane(1), output->plane(2));
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported format");
+    }
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane0, output->plane(0));
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane1, plane2);
 
     _planes[0]                            = plane0;
     _planes[1]                            = plane1;
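Editor's note: the multi-planar configure() overload above encodes the usual YUV subsampling: for NV12/NV21/IYUV the chroma planes are half the luma resolution in both x and y, while YUV444 keeps all planes at full resolution. A small sketch of the expected plane shapes (plain structs, not the library's TensorShape):

#include <cstddef>

struct Shape2D
{
    size_t width;
    size_t height;
};

// Expected chroma plane shape given the luma plane shape.
// For 4:2:0 formats (NV12, NV21, IYUV) both dimensions are halved;
// for YUV444 the chroma planes match the luma plane.
Shape2D chroma_plane_shape(Shape2D luma, bool is_420)
{
    return is_420 ? Shape2D{ luma.width / 2, luma.height / 2 } : luma;
}

// Example: a 640x480 NV12 image has a 640x480 Y plane and a 320x240 interleaved UV plane.
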
diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
index e772fa1..ebc4b85 100644
--- a/src/core/NEON/kernels/NEChannelExtractKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
@@ -51,23 +51,37 @@
 
 void NEChannelExtractKernel::configure(const ITensor *input, Channel channel, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_ON(input == output);
-    ARM_COMPUTE_ERROR_ON(Format::U8 != output->info()->format());
+
+    set_format_if_unknown(*output->info(), Format::U8);
+
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
 
     unsigned int num_elems_processed_per_iteration = 8;
 
     // Check format and channel
-    const Format       format = input->info()->format();
-    const unsigned int subsampling(((Format::YUYV422 == format || Format::UYVY422 == format) && Channel::Y != channel) ? 2 : 1);
+    const Format       format      = input->info()->format();
+    const unsigned int subsampling = (format == Format::YUYV422 || format == Format::UYVY422) && channel != Channel::Y ? 2 : 1;
+    TensorShape        output_shape;
 
     switch(format)
     {
         case Format::RGB888:
         case Format::RGBA8888:
             num_elems_processed_per_iteration = 16;
-            _func                             = (Format::RGB888 == format) ? &NEChannelExtractKernel::extract_1C_from_3C_img : &NEChannelExtractKernel::extract_1C_from_4C_img;
+            output_shape                      = input->info()->tensor_shape();
+
+            if(format == Format::RGB888)
+            {
+                _func = &NEChannelExtractKernel::extract_1C_from_3C_img;
+            }
+            else if(format == Format::RGBA8888)
+            {
+                _func = &NEChannelExtractKernel::extract_1C_from_4C_img;
+            }
+
             switch(channel)
             {
                 case Channel::R:
@@ -80,7 +94,7 @@
                     _lut_index = 2;
                     break;
                 case Channel::A:
-                    if(Format::RGBA8888 == format)
+                    if(format == Format::RGBA8888)
                     {
                         _lut_index = 3;
                         _func      = &NEChannelExtractKernel::extract_1C_from_4C_img;
@@ -93,6 +107,13 @@
             break;
         case Format::YUYV422:
         case Format::UYVY422:
+            output_shape = input->info()->tensor_shape();
+
+            if(channel != Channel::Y)
+            {
+                output_shape.set(0, output_shape[0] / 2);
+            }
+
             switch(channel)
             {
                 case Channel::Y:
@@ -119,6 +140,11 @@
             ARM_COMPUTE_ERROR("Not supported format.");
             break;
     }
+
+    set_shape_if_empty(*output->info(), output_shape);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
     _input  = input;
     _output = output;
 
@@ -131,16 +157,47 @@
 
     ValidRegion input_valid_region = input->info()->valid_region();
 
-    output_access.set_valid_region(win, ValidRegion(std::move(input_valid_region.anchor), output->info()->tensor_shape()));
+    output_access.set_valid_region(win, ValidRegion(input_valid_region.anchor, output->info()->tensor_shape()));
 
     INEKernel::configure(win);
 }
 
 void NEChannelExtractKernel::configure(const IMultiImage *input, Channel channel, IImage *output)
 {
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-    ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output));
+
+    set_format_if_unknown(*output->info(), Format::U8);
+
+    switch(input->info()->format())
+    {
+        case Format::NV12:
+        case Format::NV21:
+        case Format::IYUV:
+            switch(channel)
+            {
+                case Channel::Y:
+                    set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
+                    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
+                    break;
+                case Channel::U:
+                case Channel::V:
+                    set_shape_if_empty(*output->info(), input->plane(1)->info()->tensor_shape());
+                    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(1), output);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported channel for selected format");
+            }
+            break;
+        case Format::YUV444:
+            set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported format");
+    }
+
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
 
     unsigned int num_elems_processed_per_iteration = 32;
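Editor's note: the YUYV422/UYVY422 branch above halves the output width for the U and V channels because 4:2:2 packing stores one U and one V sample per pair of Y samples. A scalar sketch of extracting U from a YUYV row (byte order Y0 U Y1 V), with hypothetical raw buffers:

#include <cstddef>
#include <cstdint>

// Extract the U channel from one YUYV422 row.
// Each group of 4 bytes (Y0 U Y1 V) covers 2 pixels of the source image,
// so the extracted U plane is half the source width.
void extract_u_from_yuyv_row(const uint8_t *yuyv, uint8_t *u_out, size_t src_width)
{
    for(size_t x = 0; x < src_width / 2; ++x)
    {
        u_out[x] = yuyv[4 * x + 1]; // U sits at byte offset 1 within each 4-byte group
    }
}
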
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index 72af075..6d370ac 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -36,43 +36,17 @@
 
 using namespace arm_compute;
 
-NECol2ImKernel::NECol2ImKernel()
-    : _input(nullptr), _output(nullptr), _convolved_dims()
+template <typename T>
+void NECol2ImKernel::run_col2im(const Window &window)
 {
-}
-
-void NECol2ImKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-    _input          = input;
-    _output         = output;
-    _convolved_dims = convolved_dims;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps());
-
-    // The NECol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-void NECol2ImKernel::run(const Window &window)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
     const int output_stride_x = _output->info()->strides_in_bytes().x();
     const int output_stride_y = _output->info()->strides_in_bytes().y();
     const int output_stride_z = _output->info()->strides_in_bytes().z();
 
     Window window_out(window);
-    window_out.set(Window::DimX, Window::Dimension(0, 1, 0));
-    window_out.set(Window::DimY, Window::Dimension(0, 1, 0));
-    window_out.set(Window::DimZ, Window::Dimension(0, 1, 0));
+    window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+    window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
     // Create iterators
     Iterator in(_input, window);
@@ -83,7 +57,68 @@
         const int hidx = id.y();
         const int idx  = id.x() * output_stride_z + (hidx / _convolved_dims.first) * output_stride_y + (hidx % _convolved_dims.first) * output_stride_x;
 
-        *(reinterpret_cast<float *>(out.ptr() + idx)) = *(reinterpret_cast<const float *>(in.ptr()));
+        *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
     },
     in, out);
 }
+
+NECol2ImKernel::NECol2ImKernel()
+    : _func(), _input(nullptr), _output(nullptr), _convolved_dims()
+{
+}
+
+void NECol2ImKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    set_data_type_if_unknown(*output->info(), input->info()->data_type());
+
+    TensorShape output_shape = input->info()->tensor_shape();
+    output_shape.set(0, convolved_dims.first);
+    output_shape.set(1, convolved_dims.second);
+    output_shape.set(2, input->info()->tensor_shape()[0]);
+
+    set_shape_if_empty(*output->info(), output_shape);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _input          = input;
+    _output         = output;
+    _convolved_dims = convolved_dims;
+
+    switch(input->info()->element_size())
+    {
+        case 1:
+            _func = &NECol2ImKernel::run_col2im<uint8_t>;
+            break;
+        case 2:
+            _func = &NECol2ImKernel::run_col2im<uint16_t>;
+            break;
+        case 4:
+            _func = &NECol2ImKernel::run_col2im<uint32_t>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Element size not supported");
+            break;
+    }
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+
+    // The NECol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+void NECol2ImKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    (this->*_func)(window);
+}
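Editor's note: the index arithmetic in run_col2im() rebuilds the convolved output from the column matrix: the input x coordinate selects the output feature map (z), and the input y coordinate is unrolled into an (x, y) position within that map using the convolved width. A scalar sketch of the same mapping on plain buffers (the buffer layouts here are illustrative):

#include <cstddef>

// Scalar col2im: `in` holds one flattened output feature map per column
// (row-major, num_pixels rows x num_maps columns); `out` is laid out as
// [conv_width x conv_height x num_maps] with x fastest.
void col2im_ref(const float *in, float *out,
                size_t num_maps, size_t conv_width, size_t conv_height)
{
    const size_t num_pixels = conv_width * conv_height;
    for(size_t m = 0; m < num_maps; ++m)        // input x: one entry per output map
    {
        for(size_t p = 0; p < num_pixels; ++p)  // input y: flattened (x, y) position
        {
            const size_t x = p % conv_width;
            const size_t y = p / conv_width;
            out[(m * conv_height + y) * conv_width + x] = in[p * num_maps + m];
        }
    }
}
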
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp
index 110a28e..cb5152e 100644
--- a/src/core/NEON/kernels/NEColorConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp
@@ -44,8 +44,11 @@
 
 void NEColorConvertKernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON(input == nullptr);
-    ARM_COMPUTE_ERROR_ON(output == nullptr);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
 
     unsigned int num_elems_processed_per_iteration = 0;
 
@@ -137,9 +140,13 @@
 
 void NEColorConvertKernel::configure(const IMultiImage *input, IImage *output)
 {
-    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
 
+    set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
+
     unsigned int num_elems_processed_per_iteration = 0;
 
     switch(input->info()->format())
@@ -241,8 +248,50 @@
 
 void NEColorConvertKernel::configure(const IImage *input, IMultiImage *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
 
+    set_shape_if_empty(*output->plane(0)->info(), input->info()->tensor_shape());
+
+    switch(output->info()->format())
+    {
+        case Format::NV12:
+        {
+            TensorShape subsampled_shape = input->info()->tensor_shape();
+            subsampled_shape.set(0, subsampled_shape[0] / 2);
+            subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+            set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+            break;
+        }
+        case Format::IYUV:
+        {
+            TensorShape subsampled_shape = input->info()->tensor_shape();
+            subsampled_shape.set(0, subsampled_shape[0] / 2);
+            subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+            set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+            set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
+
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(2)->info()->tensor_shape());
+            break;
+        }
+        case Format::YUV444:
+            set_shape_if_empty(*output->plane(1)->info(), input->info()->tensor_shape());
+            set_shape_if_empty(*output->plane(2)->info(), input->info()->tensor_shape());
+
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(1));
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(2));
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+    }
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(0));
+
     unsigned int num_elems_processed_per_iteration = 0;
 
     switch(input->info()->format())
@@ -372,6 +421,50 @@
 
 void NEColorConvertKernel::configure(const IMultiImage *input, IMultiImage *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_ON(input == output);
+
+    set_shape_if_empty(*output->plane(0)->info(), input->plane(0)->info()->tensor_shape());
+
+    switch(output->info()->format())
+    {
+        case Format::NV12:
+        {
+            TensorShape subsampled_shape = input->plane(0)->info()->tensor_shape();
+            subsampled_shape.set(0, subsampled_shape[0] / 2);
+            subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+            set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+            break;
+        }
+        case Format::IYUV:
+        {
+            TensorShape subsampled_shape = input->plane(0)->info()->tensor_shape();
+            subsampled_shape.set(0, subsampled_shape[0] / 2);
+            subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+            set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+            set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
+
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(2)->info()->tensor_shape());
+            break;
+        }
+        case Format::YUV444:
+            set_shape_if_empty(*output->plane(1)->info(), input->plane(0)->info()->tensor_shape());
+            set_shape_if_empty(*output->plane(2)->info(), input->plane(0)->info()->tensor_shape());
+
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(1));
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(2));
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+    }
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(0));
+
     switch(input->info()->format())
     {
         case Format::NV12:
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
index b1b26cc..30e91ef 100644
--- a/src/core/NEON/kernels/NEConvolutionKernel.cpp
+++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp
@@ -40,8 +40,8 @@
 #include <cstring>
 #include <tuple>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX);
@@ -323,9 +323,13 @@
 template <unsigned int matrix_size>
 void NEConvolutionKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
+
+    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(conv == nullptr);
 
     _input  = input;
     _output = output;
@@ -358,7 +362,6 @@
     INEKernel::configure(win);
 }
 
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
 template <>
 template <typename OutputType>
 void NEConvolutionKernel<3>::convolution(const Window &win)
@@ -616,7 +619,6 @@
     },
     input, output);
 }
-#endif /* DOXYGEN_SKIP_THIS */
 
 template <unsigned int matrix_size>
 void NEConvolutionKernel<matrix_size>::run(const Window &window)
@@ -661,9 +663,13 @@
 template <unsigned int matrix_size>
 void NESeparableConvolutionHorKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_row);
+
+    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON(conv_row == nullptr);
 
     _input  = input;
     _output = output;
@@ -709,9 +715,6 @@
     }
 }
 
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
-namespace arm_compute
-{
 template <>
 template <>
 inline void NESeparableConvolutionHorKernel<5>::convolve<uint16_t>(const Window &window)
@@ -1076,8 +1079,6 @@
     },
     input, output);
 }
-} // namespace arm_compute
-#endif
 
 template class arm_compute::NESeparableConvolutionHorKernel<5>;
 template class arm_compute::NESeparableConvolutionHorKernel<7>;
@@ -1098,9 +1099,13 @@
 template <unsigned int matrix_size>
 void NESeparableConvolutionVertKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_col);
+
+    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(conv_col == nullptr);
     ARM_COMPUTE_ERROR_ON(scale == 0);
 
     _input  = input;
@@ -1417,11 +1422,15 @@
 
 void NEConvolutionRectangleKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
+
+    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(nullptr == conv);
-    ARM_COMPUTE_ERROR_ON(3 != width && 5 != width && 7 != width && 9 != width);
-    ARM_COMPUTE_ERROR_ON(3 != height && 5 != height && 7 != height && 9 != height);
+    ARM_COMPUTE_ERROR_ON(width != 3 && width != 5 && width != 7 && width != 9);
+    ARM_COMPUTE_ERROR_ON(height != 3 && height != 5 && height != 7 && height != 9);
     ARM_COMPUTE_ERROR_ON(0 == scale);
 
     _input       = input;
@@ -1606,3 +1615,4 @@
     },
     input, output);
 }
+} // namespace arm_compute
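Editor's note: for context, these kernels implement the usual scaled integer convolution: the coefficient-weighted neighbourhood is summed, divided by the (non-zero) scale, then saturated to the output type, which is why configure() now restricts the dimensions to 3, 5, 7 or 9 and rejects a zero scale. A scalar sketch of one output pixel, ignoring borders (hypothetical helper, not the library implementation):

#include <algorithm>
#include <cstdint>

// Convolve one interior pixel of a U8 image with a width x height coefficient matrix,
// divide by scale and saturate the result to U8. The caller must keep (x, y) far enough
// from the image border that every tap stays in bounds.
uint8_t convolve_pixel_u8(const uint8_t *src, int src_stride,
                          const int16_t *conv, int width, int height,
                          uint32_t scale, int x, int y)
{
    int32_t sum = 0;
    for(int j = 0; j < height; ++j)
    {
        for(int i = 0; i < width; ++i)
        {
            // Coefficient (i, j) weights the pixel offset by (i - width/2, j - height/2).
            sum += conv[j * width + i] * src[(y + j - height / 2) * src_stride + (x + i - width / 2)];
        }
    }
    sum /= static_cast<int32_t>(scale);
    return static_cast<uint8_t>(std::min(std::max(sum, 0), 255)); // saturate to [0, 255]
}
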
diff --git a/src/core/NEON/kernels/NEConvolutionLayerWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEConvolutionLayerWeightsReshapeKernel.cpp
deleted file mode 100644
index e4ea960..0000000
--- a/src/core/NEON/kernels/NEConvolutionLayerWeightsReshapeKernel.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEConvolutionLayerWeightsReshapeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-using namespace arm_compute;
-
-NEConvolutionLayerWeightsReshapeKernel::NEConvolutionLayerWeightsReshapeKernel()
-    : _input(nullptr), _bias(nullptr), _output(nullptr), _has_bias(false)
-{
-}
-
-void NEConvolutionLayerWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F32);
-    }
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4);
-    ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != input->info()->dimension(1));
-
-    _input    = input;
-    _bias     = bias;
-    _output   = output;
-    _has_bias = (bias != nullptr);
-
-    // Configure kernel
-    Window window = calculate_max_window(*input->info(), Steps());
-    window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
-    window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
-    window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
-
-    // The NEConvolutionLayerWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    INEKernel::configure(window);
-}
-
-void NEConvolutionLayerWeightsReshapeKernel::run(const Window &window)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    const unsigned int kernel_size     = _input->info()->dimension(0);
-    const unsigned int kernel_depth    = _input->info()->dimension(2);
-    const unsigned int input_stride_x  = _input->info()->strides_in_bytes().x();
-    const unsigned int input_stride_y  = _input->info()->strides_in_bytes().y();
-    const unsigned int input_stride_z  = _input->info()->strides_in_bytes().z();
-    const unsigned int output_stride_y = _output->info()->strides_in_bytes().y();
-
-    // Create iterators
-    Iterator in(_input, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Get column index
-        const int kernel_idx = id[3];
-
-        // Setup pointers
-        auto tmp_input_ptr        = in.ptr();
-        auto tmp_output_ptr       = _output->ptr_to_element(Coordinates(kernel_idx, 0));
-        auto curr_input_row_ptr   = tmp_input_ptr;
-        auto curr_input_depth_ptr = tmp_input_ptr;
-
-        // Linearize volume
-        for(unsigned int d = 0; d < kernel_depth; ++d)
-        {
-            for(unsigned int j = 0; j < kernel_size; ++j)
-            {
-                for(unsigned int i = 0; i < kernel_size; ++i)
-                {
-                    *(reinterpret_cast<float *>(tmp_output_ptr)) = *(reinterpret_cast<float *>(tmp_input_ptr));
-                    tmp_input_ptr += input_stride_x;
-                    tmp_output_ptr += output_stride_y;
-                }
-                curr_input_row_ptr += input_stride_y;
-                tmp_input_ptr = curr_input_row_ptr;
-            }
-            curr_input_depth_ptr += input_stride_z;
-            curr_input_row_ptr = curr_input_depth_ptr;
-            tmp_input_ptr      = curr_input_depth_ptr;
-        }
-
-        // Add bias
-        if(_has_bias)
-        {
-            *(reinterpret_cast<float *>(tmp_output_ptr)) = *(reinterpret_cast<float *>(_bias->ptr_to_element(Coordinates(kernel_idx, 0))));
-        }
-    },
-    in);
-}
diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
index 0160f1a..32789cb 100644
--- a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
+++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
@@ -47,6 +47,26 @@
     return false;
 }
 
+void NECumulativeDistributionKernel::configure(const IImage *input, const IDistribution1D *distribution, IDistribution1D *cumulative_sum, ILut *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, distribution, cumulative_sum, output);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+
+    set_format_if_unknown(*input->info(), Format::U8);
+
+    ARM_COMPUTE_ERROR_ON(distribution->num_bins() != cumulative_sum->num_bins());
+    ARM_COMPUTE_ERROR_ON(distribution->num_bins() != output->num_elements());
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(input->info()->data_type() != output->type());
+
+    _input          = input;
+    _distribution   = distribution;
+    _cumulative_sum = cumulative_sum;
+    _output         = output;
+
+    INEKernel::configure(calculate_max_window(*input->info()));
+}
+
 void NECumulativeDistributionKernel::run(const Window &window)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -88,19 +108,3 @@
         }
     }
 }
-
-void NECumulativeDistributionKernel::configure(const IImage *input, const IDistribution1D *distribution, IDistribution1D *cumulative_sum, ILut *output)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(nullptr == distribution);
-    ARM_COMPUTE_ERROR_ON(nullptr == cumulative_sum);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-
-    _input          = input;
-    _distribution   = distribution;
-    _cumulative_sum = cumulative_sum;
-    _output         = output;
-
-    INEKernel::configure(calculate_max_window(*input->info()));
-}
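Editor's note: the kernel above feeds histogram equalization: it turns a histogram into a cumulative distribution and from that builds a U8 look-up table. A scalar sketch of the standard construction, assuming the image is not completely flat (the exact rounding used by the library may differ):

#include <array>
#include <cstddef>
#include <cstdint>

// Build an equalization LUT from a 256-bin histogram of an image with num_pixels pixels.
std::array<uint8_t, 256> equalization_lut(const std::array<uint32_t, 256> &hist, uint32_t num_pixels)
{
    // Cumulative distribution: prefix sum of the histogram.
    std::array<uint32_t, 256> cdf{};
    uint32_t running = 0;
    for(size_t i = 0; i < cdf.size(); ++i)
    {
        running += hist[i];
        cdf[i] = running;
    }

    // First non-zero CDF value; intensities below it map to 0.
    uint32_t cdf_min = 0;
    for(uint32_t value : cdf)
    {
        if(value != 0)
        {
            cdf_min = value;
            break;
        }
    }

    // Standard equalization mapping: stretch the CDF onto the full [0, 255] range.
    std::array<uint8_t, 256> lut{};
    const float range = static_cast<float>(num_pixels - cdf_min);
    for(size_t i = 0; i < lut.size(); ++i)
    {
        const float mapped = (cdf[i] - cdf_min) / range * 255.0f;
        lut[i] = static_cast<uint8_t>(mapped + 0.5f);
    }
    return lut;
}
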
diff --git a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
new file mode 100644
index 0000000..902490e
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+NEDepthConcatenateKernel::NEDepthConcatenateKernel()
+    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
+{
+}
+
+BorderSize NEDepthConcatenateKernel::border_size() const
+{
+    return BorderSize(_top_bottom, _left_right);
+}
+
+void NEDepthConcatenateKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+    // The gaps between the two lowest dimensions of input and output need to be divisible by 2
+    // Otherwise it is not clear how the padding should be added onto the input tensor
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+    _input        = input;
+    _output       = output;
+    _depth_offset = depth_offset;
+    _left_right   = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+    _top_bottom   = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
+
+    const unsigned int num_elems_processed_per_iteration = 4;
+    const unsigned int num_elems_read_per_iteration      = 4;
+    const unsigned int num_rows_read_per_iteration       = 1;
+
+    // The window needs to be based on input as we copy all the depths of input
+    Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size());
+
+    AccessWindowRectangle  input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+void NEDepthConcatenateKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    // Offset output
+    const unsigned int offset_to_first_elements_in_bytes = _output->info()->offset_first_element_in_bytes() + _left_right * _output->info()->strides_in_bytes()[0] + _top_bottom *
+                                                           _output->info()->strides_in_bytes()[1] + _depth_offset * _output->info()->strides_in_bytes()[2];
+    uint8_t           *output_ptr                        = _output->buffer() + offset_to_first_elements_in_bytes;
+
+    Iterator input(_input, window);
+    Iterator output(_output, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const auto in_ptr  = reinterpret_cast<const float *>(input.ptr());
+        const auto out_ptr = reinterpret_cast<float *>(output_ptr + output.offset());
+
+        vst1q_f32(out_ptr, vld1q_f32(in_ptr));
+    },
+    input, output);
+}
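Editor's note: the run() method above copies one input tensor into a window of a larger output tensor: the destination starts depth_offset slices into the output and is centred spatially by the left/right and top/bottom margins computed in configure(). A scalar sketch of the same copy with plain, illustrative buffers:

#include <cstddef>

// Copy an input volume (in_w x in_h x in_d) into an output volume (out_w x out_h x out_d),
// centred in x/y and placed starting at slice depth_offset. Assumes out_w >= in_w,
// out_h >= in_h, depth_offset + in_d <= out_d and even size differences, as the kernel requires.
void depth_concatenate_ref(const float *in, float *out,
                           size_t in_w, size_t in_h, size_t in_d,
                           size_t out_w, size_t out_h,
                           size_t depth_offset)
{
    const size_t left = (out_w - in_w) / 2;
    const size_t top  = (out_h - in_h) / 2;

    for(size_t z = 0; z < in_d; ++z)
    {
        for(size_t y = 0; y < in_h; ++y)
        {
            for(size_t x = 0; x < in_w; ++x)
            {
                const size_t dst = ((depth_offset + z) * out_h + top + y) * out_w + left + x;
                out[dst]         = in[(z * in_h + y) * in_w + x];
            }
        }
    }
}
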
diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
index eae8641..56612a7 100644
--- a/src/core/NEON/kernels/NEDepthConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 
@@ -45,32 +46,27 @@
 
 void NEDepthConvertKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32);
     ARM_COMPUTE_ERROR_ON(shift >= 8);
     ARM_COMPUTE_ERROR_ON(input == output);
     ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data_types must be different");
 
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::F32),
+                             "Only data_types supported [in] QS8 ->  [out] F32");
+
     ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
-                                                                            && output->info()->data_type() != DataType::U32
                                                                             && output->info()->data_type() != DataType::S32),
-                             "Only data_types supported [in] U8 -> [out] U16, S16, U32, S32");
+                             "Only data_types supported [in] U8 -> [out] U16, S16, S32");
 
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
-                                                                             && output->info()->data_type() != DataType::S32),
-                             "Only data_types supported [in] U16 ->  [out] U8, U32, S32");
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
+                             "Only data_types supported [in] U16 ->  [out] U8, U32");
 
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
-                                                                             && output->info()->data_type() != DataType::S32),
-                             "Only data_types supported [in] S16 ->  [out] U8, U32, S32");
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
+                             "Only data_types supported [in] S16 ->  [out] U8, S32");
 
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
-                                                                             && output->info()->data_type() != DataType::S16),
-                             "Only data_types supported [in] S16 ->  [out] U8, U16, S16");
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
-                                                                             && output->info()->data_type() != DataType::S16),
-                             "Only data_types supported [in] S16 ->  [out] U8, U16, S16");
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8),
+                             "Only data_types supported [in] F32 ->  [out] QS8");
 
     _policy = policy;
     _shift  = shift;
@@ -92,6 +88,35 @@
 
     switch(_input->info()->data_type())
     {
+        case DataType::QS8:
+        {
+            const int fixed_point_position = _input->info()->fixed_point_position();
+
+            switch(_output->info()->data_type())
+            {
+                case DataType::F32:
+                {
+                    /* Up-conversion QS8 -> F32 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast<const int8_t *>(input.ptr()));
+
+                        float32x4x2_t texels_low  = vcvt_f32_qs8(vget_low_s8(texels_s8), fixed_point_position);
+                        float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_s8), fixed_point_position);
+
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()), texels_low.val[0]);
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, texels_low.val[1]);
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, texels_high.val[0]);
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, texels_high.val[1]);
+                    },
+                    input, output);
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+        }
         case DataType::U8:
         {
             const int16x8_t b = vdupq_n_s16(_shift);
@@ -121,7 +146,7 @@
                 }
                 case DataType::S32:
                 {
-                    /* Up-conversion S16 -> S32 */
+                    /* Up-conversion U8 -> S32 */
                     execute_window_loop(window, [&](const Coordinates & id)
                     {
                         const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
@@ -214,7 +239,7 @@
                 }
                 case DataType::S32:
                 {
-                    const int16x8_t b = vdupq_n_s16(_shift);
+                    const int32x4_t b = vdupq_n_s32(_shift);
 
                     /* Up-conversion S16 -> S32 */
                     execute_window_loop(window, [&](const Coordinates & id)
@@ -222,15 +247,25 @@
                         const int16x8x2_t texels =
                         {
                             {
-                                vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())), b),
-                                vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8), b)
+                                vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())),
+                                vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8)
                             }
                         };
 
-                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vmovl_s16(vget_low_s16(texels.val[0])));
-                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vmovl_s16(vget_high_s16(texels.val[0])));
-                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vmovl_s16(vget_low_s16(texels.val[1])));
-                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+                        const int32x4x4_t texels_s32 =
+                        {
+                            {
+                                vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b),
+                                vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b),
+                                vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b),
+                                vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b)
+                            }
+                        };
+
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), texels_s32.val[0]);
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, texels_s32.val[1]);
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, texels_s32.val[2]);
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, texels_s32.val[3]);
                     },
                     input, output);
                     break;
@@ -277,7 +312,7 @@
                                 }
                             };
 
-                            vst1q_u8(output.ptr(), vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
+                            vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
                         },
                         input, output);
                     }
@@ -311,6 +346,38 @@
             }
             break;
         }
+        case DataType::F32:
+        {
+            switch(_output->info()->data_type())
+            {
+                case DataType::QS8:
+                {
+                    const int fixed_point_position = _output->info()->fixed_point_position();
+                    /* Down-conversion F32 -> QS8 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const float32x4x4_t texels_f32 =
+                        {
+                            {
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
+                            }
+                        };
+
+                        const qint8x16_t texels_s8 = vcvtq_qs8_f32(texels_f32, fixed_point_position);
+
+                        vst1q_s8(reinterpret_cast<int8_t *>(output.ptr()), texels_s8);
+                    },
+                    input, output);
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+        }
         default:
             ARM_COMPUTE_ERROR("Not supported");
     }
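
The new QS8 <-> F32 paths above rely on the library's 8-bit fixed-point format, where fixed_point_position is the number of fractional bits, so a raw value v represents v / 2^fixed_point_position. A minimal scalar sketch of the conversions that vcvt_f32_qs8 and vcvtq_qs8_f32 vectorise; the round-to-nearest and saturation details below are simplifying assumptions for illustration, not a statement of the exact NEON behaviour.

#include <algorithm>
#include <cmath>
#include <cstdint>

// QS8 -> F32: divide the raw 8-bit value by 2^fixed_point_position.
inline float qs8_to_f32(int8_t v, int fixed_point_position)
{
    return static_cast<float>(v) / static_cast<float>(1 << fixed_point_position);
}

// F32 -> QS8: scale by 2^fixed_point_position, then round and saturate to [-128, 127] (assumed behaviour).
inline int8_t f32_to_qs8(float v, int fixed_point_position)
{
    const float scaled  = v * static_cast<float>(1 << fixed_point_position);
    const float clamped = std::min(127.0f, std::max(-128.0f, std::round(scaled)));
    return static_cast<int8_t>(clamped);
}

With fixed_point_position = 5, for example, the raw QS8 value 32 represents 1.0f, and 1.0f converts back to 32.
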
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
new file mode 100644
index 0000000..effc50e
--- /dev/null
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+// Internal load
+inline float32x4_t internal_vld1q(const float *in)
+{
+    return vld1q_f32(in);
+}
+inline qint8x16_t internal_vld1q(const qint8_t *in)
+{
+    return vld1q_qs8(in);
+}
+inline qint16x8_t internal_vld1q(const qint16_t *in)
+{
+    return vld1q_qs16(in);
+}
+
+// Internal store
+inline void internal_vst1q(float *p, const float32x4_t &v)
+{
+    vst1q_f32(p, v);
+}
+inline void internal_vst1q(qint8_t *p, const qint8x16_t &v)
+{
+    vst1q_qs8(p, v);
+}
+inline void internal_vst1q(qint8_t *p, const qint16x8_t &v)
+{
+    vst1_qs8(p, vqmovn_s16(v));
+}
+inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
+{
+    vst1q_qs16(p, v);
+}
+
+// Internal vdup
+inline float32x4_t internal_vdupq_n(float v)
+{
+    return vdupq_n_f32(v);
+}
+inline qint8x16_t internal_vdupq_n(qint8_t v)
+{
+    return vdupq_n_qs8(v);
+}
+inline qint16x8_t internal_vdupq_n(qint16_t v)
+{
+    return vdupq_n_qs16(v);
+}
+
+// Internal vadd
+inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y)
+{
+    return vaddq_f32(x, y);
+}
+inline qint8x16_t internal_vqaddq(const qint8x16_t &x, const qint8x16_t &y)
+{
+    return vqaddq_qs8(x, y);
+}
+inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y)
+{
+    return vqaddq_qs16(x, y);
+}
+
+template <typename T1, typename T2, bool in_place>
+void accumulate_bias(ITensor *input, const ITensor *bias, const Window window, ITensor *output)
+{
+    Iterator in(input, window);
+
+    if(in_place) // In place accumulate
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            // Get bias and pointer to input
+            const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
+            const auto vb     = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));
+
+            // Accumulate bias
+            internal_vst1q(in_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
+        },
+        in);
+    }
+    else // Out of place accumulate
+    {
+        Iterator out(output, window);
+        execute_window_loop(window, [&](const Coordinates & id)
+    /* Set step_x and step_y for matrix B. Scale the X range by a factor of 16 as the transposed input matrix has 16 times fewer columns than the output matrix */
+            // Get bias and pointer to input
+            const auto in_ptr  = reinterpret_cast<const T1 *>(in.ptr());
+            const auto out_ptr = reinterpret_cast<T2 *>(out.ptr());
+            const auto vb      = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));
+
+            // Accumulate bias
+            internal_vst1q(out_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
+        },
+        in, out);
+    }
+}
+} // namespace
+
+NEDirectConvolutionLayerBiasAccumulateKernel::NEDirectConvolutionLayerBiasAccumulateKernel()
+    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != bias->info()->fixed_point_position());
+    if(output != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F32);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(bias, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(bias, output);
+    }
+    ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
+
+    _func   = nullptr;
+    _bias   = bias;
+    _input  = input;
+    _output = output;
+
+    const unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->info()->data_type());
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     bias_access(bias->info(), 0, 0, bias->info()->dimension(0), bias->info()->dimension(1));
+    if(output != nullptr)
+    {
+        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+        update_window_and_padding(win, input_access, output_access, bias_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    }
+    else
+    {
+        update_window_and_padding(win, input_access, bias_access);
+        input_access.set_valid_region(win, ValidRegion(Coordinates(), input->info()->tensor_shape()));
+    }
+    INEKernel::configure(win);
+
+    // Set appropriate function
+    if(input->info()->data_type() == DataType::F32)
+    {
+        _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
+    }
+    else if(input->info()->data_type() == DataType::QS8)
+    {
+        _func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
+    }
+    else if(input->info()->data_type() == DataType::QS16 && bias->info()->data_type() == DataType::QS8)
+    {
+        _func = (output == nullptr) ? &accumulate_bias<qint16_t, qint8_t, true> : &accumulate_bias<qint16_t, qint8_t, false>;
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+    }
+}
+
+void NEDirectConvolutionLayerBiasAccumulateKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    (*_func)(_input, _bias, window, _output);
+}
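
Stripped of the NEON intrinsics and fixed-point plumbing, the kernel above adds one bias value per output feature map: the Z coordinate of each element selects the bias entry. A minimal scalar sketch under that reading; the flat layout and function name are illustrative assumptions, not the library's API.

#include <cstddef>
#include <vector>

// Scalar counterpart of accumulate_bias(): add bias[z] to every (x, y) element of plane z.
// The volume is stored flat with x fastest, then y, then z.
inline void accumulate_bias_scalar(std::vector<float> &inout, const std::vector<float> &bias,
                                   std::size_t width, std::size_t height, std::size_t depth)
{
    for(std::size_t z = 0; z < depth; ++z)
    {
        for(std::size_t y = 0; y < height; ++y)
        {
            for(std::size_t x = 0; x < width; ++x)
            {
                inout[(z * height + y) * width + x] += bias[z]; // same bias for the whole plane z
            }
        }
    }
}
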
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
new file mode 100644
index 0000000..d608898
--- /dev/null
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -0,0 +1,817 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace
+{
+template <unsigned int stridex>
+float32x4_t internal_vld1q(const float *in);
+
+template <>
+float32x4_t internal_vld1q<1>(const float *in)
+{
+    return vld1q_f32(in);
+}
+
+template <>
+float32x4_t internal_vld1q<2>(const float *in)
+{
+    const float32x4x2_t tmp = vld2q_f32(in);
+    return tmp.val[0];
+}
+
+template <>
+float32x4_t internal_vld1q<3>(const float *in)
+{
+    const float32x4x3_t tmp = vld3q_f32(in);
+    return tmp.val[0];
+}
+
+template <unsigned int stridex>
+qint8x8_t internal_vld1q(const qint8_t *in);
+
+template <>
+qint8x8_t internal_vld1q<1>(const qint8_t *in)
+{
+    return vld1_qs8(in);
+}
+
+template <>
+qint8x8_t internal_vld1q<2>(const qint8_t *in)
+{
+    const qint8x8x2_t tmp = vld2_s8(in);
+    return tmp.val[0];
+}
+
+template <>
+qint8x8_t internal_vld1q<3>(const qint8_t *in)
+{
+    const qint8x8x3_t tmp = vld3_s8(in);
+    return tmp.val[0];
+}
+
+template <unsigned int stridex>
+qint16x8_t internal_vld1q(const qint16_t *in);
+
+template <>
+qint16x8_t internal_vld1q<1>(const qint16_t *in)
+{
+    return vld1q_s16(in);
+}
+
+inline float32x4_t internal_vdupq_n(float v)
+{
+    return vdupq_n_f32(v);
+}
+
+inline qint8x8_t internal_vdupq_n(qint8_t v)
+{
+    return vdup_n_qs8(v);
+}
+
+inline void internal_vst1q(float *p, const float32x4_t &v)
+{
+    vst1q_f32(p, v);
+}
+
+inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
+{
+    vst1q_qs16(p, v);
+}
+
+float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
+{
+    ARM_COMPUTE_UNUSED(fixed_point_position);
+    return vmulq_f32(x, y);
+}
+
+qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
+{
+    return vmull_qs8(x, y, fixed_point_position);
+}
+
+inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
+{
+    ARM_COMPUTE_UNUSED(fixed_point_position);
+    return vmlaq_f32(x, y, z);
+}
+
+inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
+{
+    return vqmlal_qs8(x, y, z, fixed_point_position);
+}
+
+template <typename T1, typename T2, unsigned int stridex>
+class convolver_1x1
+{
+public:
+    static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+    {
+        const int          input_stride_y       = input->info()->strides_in_bytes().y();
+        const int          input_stride_z       = input->info()->strides_in_bytes().z();
+        const int          output_stride_y      = output->info()->strides_in_bytes().y();
+        const int          output_stride_z      = output->info()->strides_in_bytes().z();
+        const int          kernel_stride_z      = weights->info()->strides_in_bytes().z();
+        const int          kernel_stride_w      = weights->info()->strides_in_bytes()[3];
+        const int          output_w             = output->info()->dimension(0);
+        const int          output_h             = output->info()->dimension(1);
+        const int          range_z              = window.z().end() - window.z().start();
+        const int          kernel_depth         = weights->info()->dimension(Window::DimZ);
+        const unsigned int conv_stride_y        = std::get<1>(conv_info.stride());
+        const int          fixed_point_position = input->info()->fixed_point_position();
+
+        // set up the output window for the iterator
+        Window window_out = window;
+        window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
+        window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+        window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
+
+        // set up the input window for the iterator
+        Window window_in = window;
+        // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
+        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+        window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+        Window window_k = calculate_max_window(*weights->info(), Steps(1u));
+
+        Iterator out(output, window_out);
+        Iterator in(input, window_in);
+        Iterator k(weights, window_k);
+
+        const uint8_t *k_ptr = k.ptr();
+
+        execute_window_loop(window_out, [&](const Coordinates & id)
+        {
+            /*
+                For a detailed explanation of how the algorithm works, refer to the convolver_3x3 class (the stride-1 case)
+            */
+            const uint8_t *input_ptr = in.ptr();
+            uint8_t       *out_ptr   = out.ptr();
+            int            ih        = 0;
+            int            oh        = 0;
+            for(int oz = 0; oz < range_z; ++oz)
+            {
+                auto p_out_base = out_ptr + oz * output_stride_z;
+                // Step 1
+                {
+                    const auto k_val = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
+                    const auto vk    = internal_vdupq_n(*k_val);
+                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+                    {
+                        const int offset_xy = ih * input_stride_y;
+                        auto      in_val    = reinterpret_cast<const T1 *>(input_ptr + (0 * input_stride_z + offset_xy));
+                        auto      p_out     = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
+                        {
+                            internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val), fixed_point_position));
+                        }
+                    }
+                }
+                // Step 2
+                for(int p = 1; p < kernel_depth; ++p)
+                {
+                    const auto k_val = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
+                    const auto vk    = internal_vdupq_n(*k_val);
+                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+                    {
+                        const int offset_xy = ih * input_stride_y;
+                        auto      in_val    = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + offset_xy);
+                        auto      p_out     = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
+                        {
+                            internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val), fixed_point_position));
+                        }
+                    }
+                }
+            }
+        },
+        in, out);
+    }
+};
+
+inline float32x4x3_t load_matrix_row(const float *ptr)
+{
+    const float32x4x3_t r =
+    {
+        {
+            vld1q_dup_f32(ptr),
+            vld1q_dup_f32(1 + ptr),
+            vld1q_dup_f32(2 + ptr)
+        }
+    };
+    return r;
+}
+inline qint8x8x3_t load_matrix_row(const qint8_t *ptr)
+{
+    /* ptr is a pointer to a row of a 3x3 matrix; the function returns 3 vectors holding exactly the same value in all lanes:
+       r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
+    const qint8x8x3_t r =
+    {
+        {
+            vld1_dup_qs8(ptr),
+            vld1_dup_qs8(1 + ptr),
+            vld1_dup_qs8(2 + ptr)
+        }
+    };
+    return r;
+}
+
+template <unsigned int stridex>
+float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position);
+
+template <>
+inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+    ARM_COMPUTE_UNUSED(fixed_point_position);
+
+    const float32x4x3_t vtop =
+    {
+        {
+            vld1q_f32(in_top),
+            vld1q_f32(in_top + 4),
+            vld1q_f32(in_top + 8)
+        }
+    };
+    const float32x4x3_t vmid =
+    {
+        {
+            vld1q_f32(in_mid),
+            vld1q_f32(in_mid + 4),
+            vld1q_f32(in_mid + 8)
+        }
+    };
+    const float32x4x3_t vlow =
+    {
+        {
+            vld1q_f32(in_low),
+            vld1q_f32(in_low + 4),
+            vld1q_f32(in_low + 8)
+        }
+    };
+    float32x4x2_t out =
+    {
+        {
+            vmulq_f32(vtop.val[0], m0.val[0]),
+            vmulq_f32(vtop.val[1], m0.val[0])
+        }
+    };
+    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
+    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+    out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
+    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
+    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
+    out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
+    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
+    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
+    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
+    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
+    out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
+    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
+    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
+    out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
+    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
+    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
+    return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
+    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
+    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
+    return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
+    return out;
+}
+
+template <unsigned int stridex>
+qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position);
+
+template <>
+inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
+    ARM_COMPUTE_UNUSED(fixed_point_position);
+
+    const qint8x8x3_t vtop =
+    {
+        {
+            vld1_qs8(in_top),
+            vld1_qs8(in_top + 8),
+            vld1_qs8(in_top + 16)
+        }
+    };
+    const qint8x8x3_t vmid =
+    {
+        {
+            vld1_qs8(in_mid),
+            vld1_qs8(in_mid + 8),
+            vld1_qs8(in_mid + 16)
+        }
+    };
+    const qint8x8x3_t vlow =
+    {
+        {
+            vld1_qs8(in_low),
+            vld1_qs8(in_low + 8),
+            vld1_qs8(in_low + 16)
+        }
+    };
+    qint16x8x2_t out =
+    {
+        {
+            vmull_qs8(vtop.val[0], m0.val[0], fixed_point_position),
+            vmull_qs8(vtop.val[1], m0.val[0], fixed_point_position)
+        }
+    };
+    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 1), m0.val[1], fixed_point_position);
+    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 2), m0.val[2], fixed_point_position);
+    out.val[0] = vqmlal_qs8(out.val[0], vmid.val[0], m1.val[0], fixed_point_position);
+    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 1), m1.val[1], fixed_point_position);
+    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 2), m1.val[2], fixed_point_position);
+    out.val[0] = vqmlal_qs8(out.val[0], vlow.val[0], m2.val[0], fixed_point_position);
+    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 1), m2.val[1], fixed_point_position);
+    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 2), m2.val[2], fixed_point_position);
+    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 1), m0.val[1], fixed_point_position);
+    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 2), m0.val[2], fixed_point_position);
+    out.val[1] = vqmlal_qs8(out.val[1], vmid.val[1], m1.val[0], fixed_point_position);
+    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 1), m1.val[1], fixed_point_position);
+    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 2), m1.val[2], fixed_point_position);
+    out.val[1] = vqmlal_qs8(out.val[1], vlow.val[1], m2.val[0], fixed_point_position);
+    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 1), m2.val[1], fixed_point_position);
+    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 2), m2.val[2], fixed_point_position);
+    return out;
+}
+
+template <>
+inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
+    qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1);
+    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2);
+    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3);
+    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 0), out.val[0], 4);
+    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 2), out.val[0], 5);
+    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 4), out.val[0], 6);
+    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 6), out.val[0], 7);
+    return out;
+}
+
+template <>
+inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
+    qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), out.val[0], 1);
+    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2);
+    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3);
+    return out;
+}
+
+template <unsigned int stridex>
+void store_results(float *buffer, const float32x4x2_t &values);
+
+template <>
+void store_results<1>(float *buffer, const float32x4x2_t &values)
+{
+    vst1q_f32(buffer, values.val[0]);
+    vst1q_f32(buffer + 4, values.val[1]);
+}
+
+template <>
+void store_results<2>(float *buffer, const float32x4x2_t &values)
+{
+    vst1q_f32(buffer, values.val[0]);
+}
+
+template <>
+void store_results<3>(float *buffer, const float32x4x2_t &values)
+{
+    vst1_f32(buffer, vget_low_f32(values.val[0]));
+}
+
+template <unsigned int stridex>
+void store_results(qint16_t *buffer, const qint16x8x2_t &values);
+
+template <>
+void store_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+    vst1q_qs16(buffer, values.val[0]);
+    vst1q_qs16(buffer + 8, values.val[1]);
+}
+
+template <>
+void store_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+    vst1q_qs16(buffer, values.val[0]);
+}
+
+template <>
+void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+    vst1_qs16(buffer, vget_low_s16(values.val[0]));
+}
+
+template <unsigned int stridex>
+void accumulate_results(float *buffer, const float32x4x2_t &values);
+
+template <>
+void accumulate_results<1>(float *buffer, const float32x4x2_t &values)
+{
+    vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
+    vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
+}
+
+template <>
+void accumulate_results<2>(float *buffer, const float32x4x2_t &values)
+{
+    vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
+}
+
+template <>
+void accumulate_results<3>(float *buffer, const float32x4x2_t &values)
+{
+    vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
+}
+
+template <unsigned int stridex>
+void accumulate_results(qint16_t *buffer, const qint16x8x2_t &values);
+
+template <>
+void accumulate_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+    vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
+    vst1q_qs16(buffer + 8, vqaddq_qs16(vld1q_qs16(buffer + 8), values.val[1]));
+}
+
+template <>
+void accumulate_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+    vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
+}
+
+template <>
+void accumulate_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+    vst1_qs16(buffer, vqadd_qs16(vld1_qs16(buffer), vget_low_s16(values.val[0])));
+}
+
+template <unsigned int stridex>
+int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration);
+
+template <>
+int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration)
+{
+    return num_elems_written_per_iteration;
+}
+
+template <>
+int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration)
+{
+    return num_elems_written_per_iteration << 1;
+}
+
+template <>
+int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration)
+{
+    return num_elems_written_per_iteration * 3;
+}
+
+template <typename T1, typename T2, unsigned int stridex>
+class convolver_3x3
+{
+public:
+    static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+    {
+        ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
+        const int          input_stride_x       = input->info()->strides_in_bytes().x();
+        const int          input_stride_y       = input->info()->strides_in_bytes().y();
+        const int          input_stride_z       = input->info()->strides_in_bytes().z();
+        const int          output_stride_y      = output->info()->strides_in_bytes().y();
+        const int          output_stride_z      = output->info()->strides_in_bytes().z();
+        const int          kernel_stride_x      = weights->info()->strides_in_bytes().x();
+        const int          kernel_stride_y      = weights->info()->strides_in_bytes().y();
+        const int          kernel_stride_z      = weights->info()->strides_in_bytes().z();
+        const int          kernel_stride_w      = weights->info()->strides_in_bytes()[3];
+        const int          output_w             = output->info()->dimension(0);
+        const int          output_h             = output->info()->dimension(1);
+        const int          num_planes_z         = window.z().end() - window.z().start();
+        const int          delta_input          = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+        const int          kernel_depth         = weights->info()->dimension(Window::DimZ);
+        const unsigned int conv_stride_y        = std::get<1>(conv_info.stride());
+        const unsigned int conv_pad_x           = std::get<0>(conv_info.pad());
+        const unsigned int conv_pad_y           = std::get<1>(conv_info.pad());
+        const int          fixed_point_position = input->info()->fixed_point_position();
+
+        // set up the output window for the iterator
+        Window window_out = window;
+        window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
+        window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+        window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
+
+        // set up the input window for the iterator
+        Window window_in = window;
+        // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
+        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+        window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+        Window window_k = calculate_max_window(*weights->info(), Steps(1u));
+
+        Iterator out(output, window_out);
+        Iterator in(input, window_in);
+        Iterator k(weights, window_k);
+
+        const uint8_t *k_ptr = k.ptr();
+
+        execute_window_loop(window_out, [&](const Coordinates & id)
+        {
+            const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
+            uint8_t       *out_ptr   = out.ptr();
+            int            ih        = 0;
+            int            oh        = 0;
+            /*
+                    Each thread executing this kernel computes one or more planes of the output volume.
+
+                    For example, if the 3rd dimension of the output volume is 32, the first thread computes the output for Z = [0,7], the second thread for Z = [8,15],
+                    the third thread for Z = [16,23] and the fourth thread for Z = [24,31].
+
+                    The algorithm's outer loop iterates over Z, P, Y, X, where P is the depth/3rd dimension of each kernel. This order is not arbitrary: its main benefit is that the NEON
+                    registers containing the kernel's values are set up only once and every XY is then computed with the preloaded registers, as opposed to reloading them for every XY value.
+
+                    The algorithm does not require allocating any additional memory and computes the results directly in-place in two stages:
+                        1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values.
+                        2) Convolve the remaining planes and accumulate the results in the output plane which has been initialized in step 1.
+            */
+
+            for(int oz = 0; oz < num_planes_z; ++oz)
+            {
+                uint8_t *p_out_base = out_ptr + oz * output_stride_z;
+                // Step 1
+                {
+                    const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+                    const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+                    const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+                    const auto vk_r0    = load_matrix_row(ptr_k_r0);
+                    const auto vk_r1    = load_matrix_row(ptr_k_r1);
+                    const auto vk_r2    = load_matrix_row(ptr_k_r2);
+                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+                    {
+                        auto in_top = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
+                        auto in_mid = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
+                        auto in_low = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
+                        auto p_out  = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
+                            in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
+                        {
+                            auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
+                            store_results<stridex>(p_out, vres);
+                        }
+                    }
+                }
+                // Step 2
+                for(int p = 1; p < kernel_depth; ++p)
+                {
+                    const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+                    const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+                    const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+                    const auto vk_r0    = load_matrix_row(ptr_k_r0);
+                    const auto vk_r1    = load_matrix_row(ptr_k_r1);
+                    const auto vk_r2    = load_matrix_row(ptr_k_r2);
+                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+                    {
+                        auto in_top = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
+                        auto in_mid = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
+                        auto in_low = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
+                        auto p_out  = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
+                            in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
+                        {
+                            auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
+                            accumulate_results<stridex>(p_out, vres);
+                        }
+                    }
+                }
+            }
+        },
+        in, out);
+    }
+};
+
+template <typename T1, typename T2>
+inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+    switch(conv_stride_x)
+    {
+        case 1:
+            convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+            break;
+        case 2:
+            convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+            break;
+        case 3:
+            convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
+
+template <typename T1, typename T2>
+inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+    switch(conv_stride_x)
+    {
+        case 1:
+            convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+            break;
+        case 2:
+            convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+            break;
+        case 3:
+            convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
+} // namespace
+
+NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel()
+    : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_elems_read_per_iteration(0), _num_elems_written_per_iteration(0)
+{
+}
+
+BorderSize NEDirectConvolutionLayerKernel::border_size() const
+{
+    return _border_size;
+}
+
+void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())),
+                             "Pad > 0 not supported for 1x1 weights");
+    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1),
+                             "Pad > 1 not supported for 3x3 weights");
+    ARM_COMPUTE_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
+
+    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+    const unsigned int conv_pad_x    = std::get<0>(conv_info.pad());
+    const unsigned int conv_pad_y    = std::get<1>(conv_info.pad());
+
+    _input       = input;
+    _weights     = weights;
+    _output      = output;
+    _conv_info   = conv_info;
+    _kernel_size = weights->info()->dimension(0);
+    _border_size = BorderSize(conv_pad_y, conv_pad_x);
+
+    Window win = calculate_max_window(*output->info());
+
+    switch(_kernel_size)
+    {
+        case 1:
+        {
+            _num_elems_written_per_iteration = (input->info()->data_type() == DataType::QS8) ? 8 : 4;
+            _num_elems_read_per_iteration    = conv_stride_x * _num_elems_written_per_iteration;
+
+            win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
+            AccessWindowHorizontal input_access(input->info(), 0, _num_elems_read_per_iteration);
+            AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
+            update_window_and_padding(win, input_access, output_access);
+            output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+            break;
+        }
+        case 3:
+        {
+            if(input->info()->data_type() == DataType::F32)
+            {
+                _num_elems_read_per_iteration    = 12;
+                _num_elems_written_per_iteration = 16 >> conv_stride_x;
+            }
+            else
+            {
+                _num_elems_read_per_iteration    = 24;
+                _num_elems_written_per_iteration = 32 >> conv_stride_x;
+            }
+
+            // Calculate right and bottom border
+            const unsigned int conv_stride_y = std::get<1>(_conv_info.stride());
+            const int          input_width   = input->info()->dimension(0);
+            const int          input_height  = input->info()->dimension(1);
+            const int          upper_bound_w = ceil_to_multiple(((output->info()->dimension(0) - 1) * conv_stride_x + _kernel_size), _num_elems_read_per_iteration) - conv_pad_x - input_width;
+            const int          upper_bound_h = ((output->info()->dimension(1) - 1) * conv_stride_y - conv_pad_y + _kernel_size) - input_height;
+            _border_size.right               = std::max(upper_bound_w, static_cast<int>(_kernel_size));
+            _border_size.bottom              = std::max(upper_bound_h, static_cast<int>(_kernel_size));
+
+            // Create window and update padding
+            win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
+            AccessWindowStatic     input_access(input->info(), -conv_pad_x, -conv_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
+            AccessWindowStatic     weights_access(weights->info(), 0, 0, _kernel_size, _kernel_size);
+            AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
+            update_window_and_padding(win, input_access, weights_access, output_access);
+            output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Not implemented");
+            break;
+        }
+    }
+
+    INEKernel::configure(win);
+}
+
+void NEDirectConvolutionLayerKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+    const int kernel_size = _weights->info()->dimension(0);
+
+    switch(kernel_size)
+    {
+        case 1:
+        {
+            if(_input->info()->data_type() == DataType::QS8)
+            {
+                convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+            }
+            else
+            {
+                convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+            }
+            break;
+        }
+        case 3:
+        {
+            if(_input->info()->data_type() == DataType::QS8)
+            {
+                convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+            }
+            else
+            {
+                convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+            }
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Only kernel sizes 1x1 and 3x3 are supported.");
+            break;
+        }
+    }
+}
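
The comment inside convolver_3x3 describes the init-then-accumulate strategy in words; the scalar sketch below shows the same two stages for a single output plane without the register blocking, strides or padding handling. Shapes, names and the unit-stride / no-padding assumptions are illustrative only.

#include <cstddef>
#include <vector>

// Naive counterpart of the two-stage scheme: input plane 0 with kernel slice 0 initialises the
// output plane, every remaining input plane accumulates into it. 'in' is W x H x C (x fastest),
// 'w' holds one 3x3 slice per input channel, 'out' must hold out_w * out_h elements.
inline void direct_conv3x3_plane(const std::vector<float> &in, std::size_t in_w, std::size_t in_h, std::size_t channels,
                                 const std::vector<float> &w, std::vector<float> &out, std::size_t out_w, std::size_t out_h)
{
    auto in_at = [&](std::size_t x, std::size_t y, std::size_t c) { return in[(c * in_h + y) * in_w + x]; };
    auto w_at  = [&](std::size_t kx, std::size_t ky, std::size_t c) { return w[(c * 3 + ky) * 3 + kx]; };

    for(std::size_t c = 0; c < channels; ++c) // c == 0 initialises (step 1), c > 0 accumulates (step 2)
    {
        for(std::size_t oy = 0; oy < out_h; ++oy)
        {
            for(std::size_t ox = 0; ox < out_w; ++ox)
            {
                float acc = 0.f;
                for(std::size_t ky = 0; ky < 3; ++ky)
                {
                    for(std::size_t kx = 0; kx < 3; ++kx)
                    {
                        acc += in_at(ox + kx, oy + ky, c) * w_at(kx, ky, c);
                    }
                }
                float &dst = out[oy * out_w + ox];
                dst = (c == 0) ? acc : dst + acc;
            }
        }
    }
}
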
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 5e03c32..bd99242 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -47,13 +47,15 @@
 
 void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
 
     _tensor                = tensor;
     _border_size           = border_size;
     _mode                  = border_mode;
     _constant_border_value = constant_border_value;
 
+    _border_size.limit(tensor->info()->padding());
+
     Window win;
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
     win.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -81,10 +83,15 @@
                 case DataType::U8:
                     fill_constant_value_single_channel<uint8_t>(window);
                     break;
+                case DataType::QS8:
+                case DataType::S8:
+                    fill_constant_value_single_channel<int8_t>(window);
+                    break;
                 case DataType::U16:
                     fill_constant_value_single_channel<uint16_t>(window);
                     break;
                 case DataType::S16:
+                case DataType::QS16:
                     fill_constant_value_single_channel<int16_t>(window);
                     break;
                 case DataType::U32:
@@ -109,10 +116,15 @@
                 case DataType::U8:
                     fill_replicate_single_channel<uint8_t>(window);
                     break;
+                case DataType::QS8:
+                case DataType::S8:
+                    fill_replicate_single_channel<int8_t>(window);
+                    break;
                 case DataType::U16:
                     fill_replicate_single_channel<uint16_t>(window);
                     break;
                 case DataType::S16:
+                case DataType::QS16:
                     fill_replicate_single_channel<int16_t>(window);
                     break;
                 case DataType::U32:
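
The new _border_size.limit(tensor->info()->padding()) call clamps the requested border to the padding the tensor actually allocated, so the fill never writes outside the buffer. A rough sketch of the clamping it is expected to perform, using an illustrative struct rather than the library's BorderSize:

#include <algorithm>

// Illustrative only: clamp each side of the border to the available padding.
struct SimpleBorder
{
    unsigned int top, right, bottom, left;
};

inline void limit_to_padding(SimpleBorder &border, const SimpleBorder &padding)
{
    border.top    = std::min(border.top, padding.top);
    border.right  = std::min(border.right, padding.right);
    border.bottom = std::min(border.bottom, padding.bottom);
    border.left   = std::min(border.left, padding.left);
}
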
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 9012060..3ff8b7b 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -132,8 +132,8 @@
 
 void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4);
     ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(input->info()->dimension(1) / 4.0f));
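
The two dimension checks encode what the interleave produces: four input rows are zipped column by column into one output row, so the output is four times wider and a quarter as tall (rounded up). A scalar sketch of that layout, assuming the row count is a multiple of 4 (the kernel itself handles the padded tail):

// Scalar sketch of the 4x4 interleave layout; float stands in for any supported type.
void interleave_4x4_reference(const float *src, float *dst, int width, int height)
{
    for(int r = 0; r < height; r += 4)
    {
        float *out_row = dst + (r / 4) * 4 * width;
        for(int c = 0; c < width; ++c)
        {
            for(int k = 0; k < 4; ++k)
            {
                // src[r+0][c], src[r+1][c], src[r+2][c], src[r+3][c] become contiguous
                out_row[c * 4 + k] = src[(r + k) * width + c];
            }
        }
    }
}
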
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index daeeee0..3558c68 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -35,7 +35,6 @@
 #include <arm_neon.h>
 #include <cstddef>
 #include <cstdint>
-#include <iostream>
 #include <tuple>
 
 using namespace arm_compute;
@@ -67,7 +66,7 @@
     _output_mult_int = output_mult_int;
     _shift           = shift;
 
-    constexpr unsigned int num_elems_processed_per_iteration_x = 4;
+    constexpr unsigned int num_elems_processed_per_iteration_x = 16;
     constexpr unsigned int num_elems_processed_per_iteration_y = 4;
 
     Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
@@ -95,9 +94,9 @@
     win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
     win_a.set(Window::DimY, Window::Dimension(window.y().start() >> 2, window.y().end() >> 2, 1));
 
-    /* Set step_x and step_y for matrix B. Scale by a factor of 4 the X range as the input transposed matrix A has 4 times less the cols of the output matrix */
+    /* Set step_x and step_y for matrix B. Scale the X range by a factor of 16 as the input transposed matrix A has 16 times fewer columns than the output matrix */
     Window win_b(window);
-    win_b.set(Window::DimX, Window::Dimension(window.x().start() >> 2, window.x().end() >> 2, in_b_stride));
+    win_b.set(Window::DimX, Window::Dimension(window.x().start() >> 4, window.x().end() >> 4, in_b_stride));
     win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
 
     /* The step x and step y for the output matrix has been already set using in configure() */
@@ -105,113 +104,320 @@
     Iterator inb(_input1, win_b);
     Iterator out(_output, window);
 
-    const int32x4_t voffset_a   = vdupq_n_s32(_a_offset);
-    const int32x4_t voffset_b   = vdupq_n_s32(_b_offset);
-    const int32x4_t voffset_out = vdupq_n_s32(_output_offset);
-    const int32x4_t vshiftr     = vdupq_n_s32(-_shift);
+    const int32x4_t voffset_a = vdupq_n_s32(_a_offset);
+    const int32x4_t voffset_b = vdupq_n_s32(_b_offset);
+    const int32x4_t vshiftr   = vdupq_n_s32(-_shift);
 
-    const int width_b   = _input1->info()->dimension(0);
-    const int max_it_16 = width_b - 16;
+    const int width_b = _input1->info()->dimension(0);
 
+    // The implementation assumes that matrix A and matrix B have been reshaped with NEGEMMInterleave4x4 and NEGEMMTranspose1xW respectively
+    // Reshaping the matrices keeps the implementation cache friendly and avoids the data re-arrangements needed to compute 16x4 elements per iteration
+    // All the values needed to compute a single 4x4 block are read from consecutive memory positions
     execute_window_loop(window, [&](const Coordinates &)
     {
-        auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
-        auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());
+        const uint8_t *mtx_a0 = ina.ptr();
+        const uint8_t *mtx_b0 = inb.ptr();
 
-        int32x4x4_t c =
+        // Accumulators for the block 0
+        int32x4x4_t c0 =
         {
             {
-                vdupq_n_s32(0),
-                vdupq_n_s32(0),
-                vdupq_n_s32(0),
-                vdupq_n_s32(0)
+                vdupq_n_s32(_output_offset),
+                vdupq_n_s32(_output_offset),
+                vdupq_n_s32(_output_offset),
+                vdupq_n_s32(_output_offset)
             }
         };
-        int k = 0;
-        // if max_it_16 < 0 we skip the for block and fall back to process just 4 elements
-        for(; k <= max_it_16; k += 16, mtx_a0 += 16, mtx_b0 += 16)
+
+        // Accumulators for the block 1
+        int32x4x4_t c1 =
         {
-            const int8x16_t   p00 = vld1q_s8(mtx_a0);
-            const int8x16_t   q00 = vld1q_s8(mtx_b0);
-            const int32x4x4_t ia0 =
+            {
+                vdupq_n_s32(_output_offset),
+                vdupq_n_s32(_output_offset),
+                vdupq_n_s32(_output_offset),
+                vdupq_n_s32(_output_offset)
+            }
+        };
+
+        // Accumulators for the block 2
+        int32x4x4_t c2 =
+        {
+            {
+                vdupq_n_s32(_output_offset),
+                vdupq_n_s32(_output_offset),
+                vdupq_n_s32(_output_offset),
+                vdupq_n_s32(_output_offset)
+            }
+        };
+
+        // Accumulators for the block 3
+        int32x4x4_t c3 =
+        {
+            {
+                vdupq_n_s32(_output_offset),
+                vdupq_n_s32(_output_offset),
+                vdupq_n_s32(_output_offset),
+                vdupq_n_s32(_output_offset)
+            }
+        };
+
+        int k = 0;
+        // This for loop performs 4 accumulations per iteration
+        for(; k <= (width_b - 64); k += 64, mtx_a0 += 16, mtx_b0 += 64)
+        {
+            const uint8x8_t p00  = vld1_u8(mtx_a0 + 0);
+            const uint8x8_t p01  = vld1_u8(mtx_a0 + 8);
+            const uint8x8_t q00l = vld1_u8(mtx_b0 + 0);
+            const uint8x8_t q00h = vld1_u8(mtx_b0 + 8);
+            const uint8x8_t q01l = vld1_u8(mtx_b0 + 16);
+            const uint8x8_t q01h = vld1_u8(mtx_b0 + 24);
+            const uint8x8_t q02l = vld1_u8(mtx_b0 + 32);
+            const uint8x8_t q02h = vld1_u8(mtx_b0 + 40);
+            const uint8x8_t q03l = vld1_u8(mtx_b0 + 48);
+            const uint8x8_t q03h = vld1_u8(mtx_b0 + 56);
+
+            const int32x4_t ia0l = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p00))));
+            const int32x4_t ia0h = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(p00))));
+            const int32x4_t ia1l = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p01))));
+            const int32x4_t ia1h = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(p01))));
+
+            const int32x2x4_t ia0 =
             {
                 {
-                    vaddw_s16(voffset_a, vget_low_s16(vmovl_s8(vget_low_s8(p00)))),
-                    vaddw_s16(voffset_a, vget_high_s16(vmovl_s8(vget_low_s8(p00)))),
-                    vaddw_s16(voffset_a, vget_low_s16(vmovl_s8(vget_high_s8(p00)))),
-                    vaddw_s16(voffset_a, vget_high_s16(vmovl_s8(vget_high_s8(p00))))
+                    vget_low_s32(ia0l),
+                    vget_high_s32(ia0l),
+                    vget_low_s32(ia0h),
+                    vget_high_s32(ia0h)
                 }
             };
+
+            const int32x2x4_t ia1 =
+            {
+                {
+                    vget_low_s32(ia1l),
+                    vget_high_s32(ia1l),
+                    vget_low_s32(ia1h),
+                    vget_high_s32(ia1h)
+                }
+            };
+
             const int32x4x4_t ib0 =
             {
                 {
-                    vaddw_s16(voffset_b, vget_low_s16(vmovl_s8(vget_low_s8(q00)))),
-                    vaddw_s16(voffset_b, vget_high_s16(vmovl_s8(vget_low_s8(q00)))),
-                    vaddw_s16(voffset_b, vget_low_s16(vmovl_s8(vget_high_s8(q00)))),
-                    vaddw_s16(voffset_b, vget_high_s16(vmovl_s8(vget_high_s8(q00))))
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00l)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00l)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00h)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00h))))
                 }
             };
-            /* Accumulation 0 */
-            c.val[0] = vmlaq_lane_s32(c.val[0], ib0.val[0], vget_low_s32(ia0.val[0]), 0);
-            c.val[1] = vmlaq_lane_s32(c.val[1], ib0.val[0], vget_low_s32(ia0.val[0]), 1);
-            c.val[2] = vmlaq_lane_s32(c.val[2], ib0.val[0], vget_high_s32(ia0.val[0]), 0);
-            c.val[3] = vmlaq_lane_s32(c.val[3], ib0.val[0], vget_high_s32(ia0.val[0]), 1);
-            /* Accumulation 1 */
-            c.val[0] = vmlaq_lane_s32(c.val[0], ib0.val[1], vget_low_s32(ia0.val[1]), 0);
-            c.val[1] = vmlaq_lane_s32(c.val[1], ib0.val[1], vget_low_s32(ia0.val[1]), 1);
-            c.val[2] = vmlaq_lane_s32(c.val[2], ib0.val[1], vget_high_s32(ia0.val[1]), 0);
-            c.val[3] = vmlaq_lane_s32(c.val[3], ib0.val[1], vget_high_s32(ia0.val[1]), 1);
-            /* Accumulation 2 */
-            c.val[0] = vmlaq_lane_s32(c.val[0], ib0.val[2], vget_low_s32(ia0.val[2]), 0);
-            c.val[1] = vmlaq_lane_s32(c.val[1], ib0.val[2], vget_low_s32(ia0.val[2]), 1);
-            c.val[2] = vmlaq_lane_s32(c.val[2], ib0.val[2], vget_high_s32(ia0.val[2]), 0);
-            c.val[3] = vmlaq_lane_s32(c.val[3], ib0.val[2], vget_high_s32(ia0.val[2]), 1);
-            /* Accumulation 3 */
-            c.val[0] = vmlaq_lane_s32(c.val[0], ib0.val[3], vget_low_s32(ia0.val[3]), 0);
-            c.val[1] = vmlaq_lane_s32(c.val[1], ib0.val[3], vget_low_s32(ia0.val[3]), 1);
-            c.val[2] = vmlaq_lane_s32(c.val[2], ib0.val[3], vget_high_s32(ia0.val[3]), 0);
-            c.val[3] = vmlaq_lane_s32(c.val[3], ib0.val[3], vget_high_s32(ia0.val[3]), 1);
-        }
-        for(; k < width_b; k += 4, mtx_a0 += 4, mtx_b0 += 4)
-        {
-            const int8x8_t  p00 = vld1_s8(mtx_a0);
-            const int8x8_t  q00 = vld1_s8(mtx_b0);
-            const int32x4_t ia0 = vaddw_s16(voffset_a, vget_low_s16(vmovl_s8(p00)));
-            const int32x4_t ib0 = vaddw_s16(voffset_b, vget_low_s16(vmovl_s8(q00)));
 
-            c.val[0] = vmlaq_lane_s32(c.val[0], ib0, vget_low_s32(ia0), 0);
-            c.val[1] = vmlaq_lane_s32(c.val[1], ib0, vget_low_s32(ia0), 1);
-            c.val[2] = vmlaq_lane_s32(c.val[2], ib0, vget_high_s32(ia0), 0);
-            c.val[3] = vmlaq_lane_s32(c.val[3], ib0, vget_high_s32(ia0), 1);
+            const int32x4x4_t ib1 =
+            {
+                {
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q01l)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q01l)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q01h)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q01h))))
+                }
+            };
+
+            const int32x4x4_t ib2 =
+            {
+                {
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q02l)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q02l)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q02h)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q02h))))
+                }
+            };
+
+            const int32x4x4_t ib3 =
+            {
+                {
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q03l)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q03l)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q03h)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q03h))))
+                }
+            };
+
+            // 4x4 block 0 - Accumulation 0
+            c0.val[0] = vmlaq_lane_s32(c0.val[0], ib0.val[0], ia0.val[0], 0);
+            c0.val[1] = vmlaq_lane_s32(c0.val[1], ib0.val[0], ia0.val[0], 1);
+            c0.val[2] = vmlaq_lane_s32(c0.val[2], ib0.val[0], ia0.val[1], 0);
+            c0.val[3] = vmlaq_lane_s32(c0.val[3], ib0.val[0], ia0.val[1], 1);
+            // 4x4 block 0 - Accumulation 1
+            c0.val[0] = vmlaq_lane_s32(c0.val[0], ib1.val[0], ia0.val[2], 0);
+            c0.val[1] = vmlaq_lane_s32(c0.val[1], ib1.val[0], ia0.val[2], 1);
+            c0.val[2] = vmlaq_lane_s32(c0.val[2], ib1.val[0], ia0.val[3], 0);
+            c0.val[3] = vmlaq_lane_s32(c0.val[3], ib1.val[0], ia0.val[3], 1);
+            // 4x4 block 0 - Accumulation 2
+            c0.val[0] = vmlaq_lane_s32(c0.val[0], ib2.val[0], ia1.val[0], 0);
+            c0.val[1] = vmlaq_lane_s32(c0.val[1], ib2.val[0], ia1.val[0], 1);
+            c0.val[2] = vmlaq_lane_s32(c0.val[2], ib2.val[0], ia1.val[1], 0);
+            c0.val[3] = vmlaq_lane_s32(c0.val[3], ib2.val[0], ia1.val[1], 1);
+            // 4x4 block 0 - Accumulation 3
+            c0.val[0] = vmlaq_lane_s32(c0.val[0], ib3.val[0], ia1.val[2], 0);
+            c0.val[1] = vmlaq_lane_s32(c0.val[1], ib3.val[0], ia1.val[2], 1);
+            c0.val[2] = vmlaq_lane_s32(c0.val[2], ib3.val[0], ia1.val[3], 0);
+            c0.val[3] = vmlaq_lane_s32(c0.val[3], ib3.val[0], ia1.val[3], 1);
+
+            // 4x4 block 1 - Accumulation 0
+            c1.val[0] = vmlaq_lane_s32(c1.val[0], ib0.val[1], ia0.val[0], 0);
+            c1.val[1] = vmlaq_lane_s32(c1.val[1], ib0.val[1], ia0.val[0], 1);
+            c1.val[2] = vmlaq_lane_s32(c1.val[2], ib0.val[1], ia0.val[1], 0);
+            c1.val[3] = vmlaq_lane_s32(c1.val[3], ib0.val[1], ia0.val[1], 1);
+            // 4x4 block 1 - Accumulation 1
+            c1.val[0] = vmlaq_lane_s32(c1.val[0], ib1.val[1], ia0.val[2], 0);
+            c1.val[1] = vmlaq_lane_s32(c1.val[1], ib1.val[1], ia0.val[2], 1);
+            c1.val[2] = vmlaq_lane_s32(c1.val[2], ib1.val[1], ia0.val[3], 0);
+            c1.val[3] = vmlaq_lane_s32(c1.val[3], ib1.val[1], ia0.val[3], 1);
+            // 4x4 block 1 - Accumulation 2
+            c1.val[0] = vmlaq_lane_s32(c1.val[0], ib2.val[1], ia1.val[0], 0);
+            c1.val[1] = vmlaq_lane_s32(c1.val[1], ib2.val[1], ia1.val[0], 1);
+            c1.val[2] = vmlaq_lane_s32(c1.val[2], ib2.val[1], ia1.val[1], 0);
+            c1.val[3] = vmlaq_lane_s32(c1.val[3], ib2.val[1], ia1.val[1], 1);
+            // 4x4 block 1 - Accumulation 3
+            c1.val[0] = vmlaq_lane_s32(c1.val[0], ib3.val[1], ia1.val[2], 0);
+            c1.val[1] = vmlaq_lane_s32(c1.val[1], ib3.val[1], ia1.val[2], 1);
+            c1.val[2] = vmlaq_lane_s32(c1.val[2], ib3.val[1], ia1.val[3], 0);
+            c1.val[3] = vmlaq_lane_s32(c1.val[3], ib3.val[1], ia1.val[3], 1);
+
+            // 4x4 block 2 - Accumulation 0
+            c2.val[0] = vmlaq_lane_s32(c2.val[0], ib0.val[2], ia0.val[0], 0);
+            c2.val[1] = vmlaq_lane_s32(c2.val[1], ib0.val[2], ia0.val[0], 1);
+            c2.val[2] = vmlaq_lane_s32(c2.val[2], ib0.val[2], ia0.val[1], 0);
+            c2.val[3] = vmlaq_lane_s32(c2.val[3], ib0.val[2], ia0.val[1], 1);
+            // 4x4 block 2 - Accumulation 1
+            c2.val[0] = vmlaq_lane_s32(c2.val[0], ib1.val[2], ia0.val[2], 0);
+            c2.val[1] = vmlaq_lane_s32(c2.val[1], ib1.val[2], ia0.val[2], 1);
+            c2.val[2] = vmlaq_lane_s32(c2.val[2], ib1.val[2], ia0.val[3], 0);
+            c2.val[3] = vmlaq_lane_s32(c2.val[3], ib1.val[2], ia0.val[3], 1);
+            // 4x4 block 2 - Accumulation 2
+            c2.val[0] = vmlaq_lane_s32(c2.val[0], ib2.val[2], ia1.val[0], 0);
+            c2.val[1] = vmlaq_lane_s32(c2.val[1], ib2.val[2], ia1.val[0], 1);
+            c2.val[2] = vmlaq_lane_s32(c2.val[2], ib2.val[2], ia1.val[1], 0);
+            c2.val[3] = vmlaq_lane_s32(c2.val[3], ib2.val[2], ia1.val[1], 1);
+            // 4x4 block 2 - Accumulation 3
+            c2.val[0] = vmlaq_lane_s32(c2.val[0], ib3.val[2], ia1.val[2], 0);
+            c2.val[1] = vmlaq_lane_s32(c2.val[1], ib3.val[2], ia1.val[2], 1);
+            c2.val[2] = vmlaq_lane_s32(c2.val[2], ib3.val[2], ia1.val[3], 0);
+            c2.val[3] = vmlaq_lane_s32(c2.val[3], ib3.val[2], ia1.val[3], 1);
+
+            // 4x4 block 3 - Accumulation 0
+            c3.val[0] = vmlaq_lane_s32(c3.val[0], ib0.val[3], ia0.val[0], 0);
+            c3.val[1] = vmlaq_lane_s32(c3.val[1], ib0.val[3], ia0.val[0], 1);
+            c3.val[2] = vmlaq_lane_s32(c3.val[2], ib0.val[3], ia0.val[1], 0);
+            c3.val[3] = vmlaq_lane_s32(c3.val[3], ib0.val[3], ia0.val[1], 1);
+            // 4x4 block 3 - Accumulation 1
+            c3.val[0] = vmlaq_lane_s32(c3.val[0], ib1.val[3], ia0.val[2], 0);
+            c3.val[1] = vmlaq_lane_s32(c3.val[1], ib1.val[3], ia0.val[2], 1);
+            c3.val[2] = vmlaq_lane_s32(c3.val[2], ib1.val[3], ia0.val[3], 0);
+            c3.val[3] = vmlaq_lane_s32(c3.val[3], ib1.val[3], ia0.val[3], 1);
+            // 4x4 block 3 - Accumulation 2
+            c3.val[0] = vmlaq_lane_s32(c3.val[0], ib2.val[3], ia1.val[0], 0);
+            c3.val[1] = vmlaq_lane_s32(c3.val[1], ib2.val[3], ia1.val[0], 1);
+            c3.val[2] = vmlaq_lane_s32(c3.val[2], ib2.val[3], ia1.val[1], 0);
+            c3.val[3] = vmlaq_lane_s32(c3.val[3], ib2.val[3], ia1.val[1], 1);
+            // 4x4 block 3 - Accumulation 3
+            c3.val[0] = vmlaq_lane_s32(c3.val[0], ib3.val[3], ia1.val[2], 0);
+            c3.val[1] = vmlaq_lane_s32(c3.val[1], ib3.val[3], ia1.val[2], 1);
+            c3.val[2] = vmlaq_lane_s32(c3.val[2], ib3.val[3], ia1.val[3], 0);
+            c3.val[3] = vmlaq_lane_s32(c3.val[3], ib3.val[3], ia1.val[3], 1);
         }
-        c.val[0] = vshlq_s32(vmulq_n_s32(vaddq_s32(voffset_out, c.val[0]), _output_mult_int), vshiftr);
-        c.val[1] = vshlq_s32(vmulq_n_s32(vaddq_s32(voffset_out, c.val[1]), _output_mult_int), vshiftr);
-        c.val[2] = vshlq_s32(vmulq_n_s32(vaddq_s32(voffset_out, c.val[2]), _output_mult_int), vshiftr);
-        c.val[3] = vshlq_s32(vmulq_n_s32(vaddq_s32(voffset_out, c.val[3]), _output_mult_int), vshiftr);
-        const uint8x8x2_t r =
+
+        // This for loop handles the left-over accumulations
+        for(; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
+        {
+            const uint8x8_t p00  = vld1_u8(mtx_a0);
+            const uint8x8_t q00l = vld1_u8(mtx_b0);
+            const uint8x8_t q00h = vld1_u8(mtx_b0 + 8);
+
+            const int32x4_t ia0 = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p00))));
+
+            const int32x2x2_t ia =
+            {
+                {
+                    vget_low_s32(ia0),
+                    vget_high_s32(ia0)
+                }
+            };
+
+            const int32x4x4_t ib0 =
+            {
+                {
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00l)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00l)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00h)))),
+                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00h))))
+                }
+            };
+
+            // 4x4 block 0
+            c0.val[0] = vmlaq_lane_s32(c0.val[0], ib0.val[0], ia.val[0], 0);
+            c0.val[1] = vmlaq_lane_s32(c0.val[1], ib0.val[0], ia.val[0], 1);
+            c0.val[2] = vmlaq_lane_s32(c0.val[2], ib0.val[0], ia.val[1], 0);
+            c0.val[3] = vmlaq_lane_s32(c0.val[3], ib0.val[0], ia.val[1], 1);
+
+            // 4x4 block 1
+            c1.val[0] = vmlaq_lane_s32(c1.val[0], ib0.val[1], ia.val[0], 0);
+            c1.val[1] = vmlaq_lane_s32(c1.val[1], ib0.val[1], ia.val[0], 1);
+            c1.val[2] = vmlaq_lane_s32(c1.val[2], ib0.val[1], ia.val[1], 0);
+            c1.val[3] = vmlaq_lane_s32(c1.val[3], ib0.val[1], ia.val[1], 1);
+
+            // 4x4 block 2
+            c2.val[0] = vmlaq_lane_s32(c2.val[0], ib0.val[2], ia.val[0], 0);
+            c2.val[1] = vmlaq_lane_s32(c2.val[1], ib0.val[2], ia.val[0], 1);
+            c2.val[2] = vmlaq_lane_s32(c2.val[2], ib0.val[2], ia.val[1], 0);
+            c2.val[3] = vmlaq_lane_s32(c2.val[3], ib0.val[2], ia.val[1], 1);
+
+            // 4x4 block 3
+            c3.val[0] = vmlaq_lane_s32(c3.val[0], ib0.val[3], ia.val[0], 0);
+            c3.val[1] = vmlaq_lane_s32(c3.val[1], ib0.val[3], ia.val[0], 1);
+            c3.val[2] = vmlaq_lane_s32(c3.val[2], ib0.val[3], ia.val[1], 0);
+            c3.val[3] = vmlaq_lane_s32(c3.val[3], ib0.val[3], ia.val[1], 1);
+        }
+
+        c0.val[0] = vshlq_s32(vmulq_n_s32(c0.val[0], _output_mult_int), vshiftr);
+        c0.val[1] = vshlq_s32(vmulq_n_s32(c0.val[1], _output_mult_int), vshiftr);
+        c0.val[2] = vshlq_s32(vmulq_n_s32(c0.val[2], _output_mult_int), vshiftr);
+        c0.val[3] = vshlq_s32(vmulq_n_s32(c0.val[3], _output_mult_int), vshiftr);
+
+        c1.val[0] = vshlq_s32(vmulq_n_s32(c1.val[0], _output_mult_int), vshiftr);
+        c1.val[1] = vshlq_s32(vmulq_n_s32(c1.val[1], _output_mult_int), vshiftr);
+        c1.val[2] = vshlq_s32(vmulq_n_s32(c1.val[2], _output_mult_int), vshiftr);
+        c1.val[3] = vshlq_s32(vmulq_n_s32(c1.val[3], _output_mult_int), vshiftr);
+
+        c2.val[0] = vshlq_s32(vmulq_n_s32(c2.val[0], _output_mult_int), vshiftr);
+        c2.val[1] = vshlq_s32(vmulq_n_s32(c2.val[1], _output_mult_int), vshiftr);
+        c2.val[2] = vshlq_s32(vmulq_n_s32(c2.val[2], _output_mult_int), vshiftr);
+        c2.val[3] = vshlq_s32(vmulq_n_s32(c2.val[3], _output_mult_int), vshiftr);
+
+        c3.val[0] = vshlq_s32(vmulq_n_s32(c3.val[0], _output_mult_int), vshiftr);
+        c3.val[1] = vshlq_s32(vmulq_n_s32(c3.val[1], _output_mult_int), vshiftr);
+        c3.val[2] = vshlq_s32(vmulq_n_s32(c3.val[2], _output_mult_int), vshiftr);
+        c3.val[3] = vshlq_s32(vmulq_n_s32(c3.val[3], _output_mult_int), vshiftr);
+
+        const uint8x16x4_t r =
         {
             {
-                vqmovun_s16(vcombine_s16(vqmovn_s32(c.val[0]), vqmovn_s32(c.val[1]))),
-                vqmovun_s16(vcombine_s16(vqmovn_s32(c.val[2]), vqmovn_s32(c.val[3])))
+                vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[0]), vqmovn_s32(c1.val[0]))),
+                vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[0]), vqmovn_s32(c3.val[0])))),
+                vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[1]), vqmovn_s32(c1.val[1]))),
+                vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[1]), vqmovn_s32(c3.val[1])))),
+                vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[2]), vqmovn_s32(c1.val[2]))),
+                vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[2]), vqmovn_s32(c3.val[2])))),
+                vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[3]), vqmovn_s32(c1.val[3]))),
+                vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[3]), vqmovn_s32(c3.val[3]))))
             }
         };
-        const auto mtx_out = reinterpret_cast<uint8_t *>(out.ptr());
-        vst1_lane_u8(mtx_out + 0 * out_stride + 0, r.val[0], 0);
-        vst1_lane_u8(mtx_out + 0 * out_stride + 1, r.val[0], 1);
-        vst1_lane_u8(mtx_out + 0 * out_stride + 2, r.val[0], 2);
-        vst1_lane_u8(mtx_out + 0 * out_stride + 3, r.val[0], 3);
-        vst1_lane_u8(mtx_out + 1 * out_stride + 0, r.val[0], 4);
-        vst1_lane_u8(mtx_out + 1 * out_stride + 1, r.val[0], 5);
-        vst1_lane_u8(mtx_out + 1 * out_stride + 2, r.val[0], 6);
-        vst1_lane_u8(mtx_out + 1 * out_stride + 3, r.val[0], 7);
-        vst1_lane_u8(mtx_out + 2 * out_stride + 0, r.val[1], 0);
-        vst1_lane_u8(mtx_out + 2 * out_stride + 1, r.val[1], 1);
-        vst1_lane_u8(mtx_out + 2 * out_stride + 2, r.val[1], 2);
-        vst1_lane_u8(mtx_out + 2 * out_stride + 3, r.val[1], 3);
-        vst1_lane_u8(mtx_out + 3 * out_stride + 0, r.val[1], 4);
-        vst1_lane_u8(mtx_out + 3 * out_stride + 1, r.val[1], 5);
-        vst1_lane_u8(mtx_out + 3 * out_stride + 2, r.val[1], 6);
-        vst1_lane_u8(mtx_out + 3 * out_stride + 3, r.val[1], 7);
+
+        uint8_t *const mtx_out = out.ptr();
+        vst1q_u8(mtx_out + 0 * out_stride, r.val[0]);
+        vst1q_u8(mtx_out + 1 * out_stride, r.val[1]);
+        vst1q_u8(mtx_out + 2 * out_stride, r.val[2]);
+        vst1q_u8(mtx_out + 3 * out_stride, r.val[3]);
     },
     ina, inb, out);
 }
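
Per output element, the reworked kernel follows the usual low-precision GEMM pipeline: the u8 operands are offset and multiply-accumulated into an s32 accumulator pre-loaded with the output offset, and the result is scaled, shifted and saturated back to u8. A scalar reference of that per-element arithmetic as the intrinsics imply it (non-rounding arithmetic shift, as vshlq_s32 with a negative shift performs); the function name and raw pointers are illustrative only.

#include <algorithm>
#include <cstdint>

// Scalar model of one output element; a and b are the reshaped rows read above.
uint8_t gemmlowp_element(const uint8_t *a, const uint8_t *b, int len,
                         int32_t a_offset, int32_t b_offset, int32_t output_offset,
                         int32_t output_mult_int, int32_t shift)
{
    int32_t acc = output_offset; // matches the accumulators initialised with _output_offset
    for(int k = 0; k < len; ++k)
    {
        acc += (static_cast<int32_t>(a[k]) + a_offset) * (static_cast<int32_t>(b[k]) + b_offset);
    }
    acc = (acc * output_mult_int) >> shift; // vmulq_n_s32 + vshlq_s32(-shift)
    return static_cast<uint8_t>(std::min<int32_t>(std::max<int32_t>(acc, 0), 255)); // vqmovn_s32 / vqmovun_s16 saturation
}
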
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
index 70018ec..7a3bae5 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
@@ -44,26 +45,31 @@
 
 void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
     ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
 
     _biases = biases;
     _accum  = accum;
 
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
 
     // Configure kernel window
     Window win = calculate_max_window(*accum->info(), Steps(num_elems_processed_per_iteration));
 
-    AccessWindowStatic output_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1));
+    AccessWindowStatic biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1));
 
     update_window_and_padding(win,
                               AccessWindowHorizontal(accum->info(), 0, num_elems_processed_per_iteration),
-                              output_access);
+                              biases_access);
 
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), accum->info()->tensor_shape()));
+    AccessWindowHorizontal output_access(accum->info(), 0, num_elems_processed_per_iteration);
+
+    // Set the valid region for the accum tensor
+    Coordinates coord;
+    coord.set_num_dimensions(accum->info()->num_dimensions());
+    output_access.set_valid_region(win, ValidRegion(coord, accum->info()->tensor_shape()));
 
     INEKernel::configure(win);
 }
@@ -80,12 +86,43 @@
     Iterator in0_out(_accum, window);
     Iterator in1(_biases, win_biases);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    switch(_accum->info()->data_type())
     {
-        const float32x4_t accum  = vld1q_f32(reinterpret_cast<const float *>(in0_out.ptr()));
-        const float32x4_t biases = vld1q_f32(reinterpret_cast<const float *>(in1.ptr()));
+        case DataType::F32:
+        {
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                const float32x4x4_t accum  = vld4q_f32(reinterpret_cast<const float *>(in0_out.ptr()));
+                const float32x4x4_t biases = vld4q_f32(reinterpret_cast<const float *>(in1.ptr()));
+                const float32x4x4_t res =
+                {
+                    {
+                        vaddq_f32(accum.val[0], biases.val[0]),
+                        vaddq_f32(accum.val[1], biases.val[1]),
+                        vaddq_f32(accum.val[2], biases.val[2]),
+                        vaddq_f32(accum.val[3], biases.val[3])
+                    }
+                };
 
-        vst1q_f32(reinterpret_cast<float *>(in0_out.ptr()), vaddq_f32(accum, biases));
-    },
-    in0_out, in1);
+                vst4q_f32(reinterpret_cast<float *>(in0_out.ptr()), res);
+            },
+            in0_out, in1);
+            break;
+        }
+        case DataType::QS8:
+        {
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                const qint8x16_t accum  = vld1q_qs8(reinterpret_cast<const qint8_t *>(in0_out.ptr()));
+                const qint8x16_t biases = vld1q_qs8(reinterpret_cast<const qint8_t *>(in1.ptr()));
+
+                vst1q_qs8(reinterpret_cast<qint8_t *>(in0_out.ptr()), vqaddq_qs8(accum, biases));
+            },
+            in0_out, in1);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported");
+            break;
+    }
 }
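
Both paths broadcast the 1-D bias row across every row of the accumulator; the QS8 path does it with a saturating fixed-point add (vqaddq_qs8), which needs no fixed_point_position because both operands share the same Q format. A scalar sketch of the QS8 behaviour, with plain pointers standing in for the Iterator/Window pair:

#include <algorithm>
#include <cstdint>

// Scalar sketch: add the bias row to every accumulator row, saturating to the qint8 range.
void accumulate_biases_qs8_reference(int8_t *accum, const int8_t *biases, int width, int height)
{
    for(int y = 0; y < height; ++y)
    {
        for(int x = 0; x < width; ++x)
        {
            const int32_t sum = static_cast<int32_t>(accum[y * width + x]) + biases[x];
            accum[y * width + x] = static_cast<int8_t>(std::min<int32_t>(std::max<int32_t>(sum, -128), 127));
        }
    }
}
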
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index c6460b1..71dd4c7 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
@@ -37,19 +38,151 @@
 class Coordinates;
 } // namespace arm_compute
 
+namespace
+{
+void matrix_addition_f32(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
+    const float32x4_t beta_f32 = vdupq_n_f32(beta);
+
+    Iterator in(input, window);
+    Iterator out(output, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const auto in_ptr  = reinterpret_cast<const float *>(in.ptr());
+        const auto out_ptr = reinterpret_cast<float *>(out.ptr());
+
+        float32x4x4_t alpha_ab =
+        {
+            {
+                vld1q_f32(out_ptr + 0),
+                vld1q_f32(out_ptr + 4),
+                vld1q_f32(out_ptr + 8),
+                vld1q_f32(out_ptr + 12)
+            }
+        };
+
+        const float32x4x4_t c =
+        {
+            {
+                vld1q_f32(in_ptr + 0),
+                vld1q_f32(in_ptr + 4),
+                vld1q_f32(in_ptr + 8),
+                vld1q_f32(in_ptr + 12)
+            }
+        };
+
+        // Multiply matrix C by its weight and accumulate
+        alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
+        alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
+        alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
+        alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
+
+        vst1q_f32(out_ptr + 0, alpha_ab.val[0]);
+        vst1q_f32(out_ptr + 4, alpha_ab.val[1]);
+        vst1q_f32(out_ptr + 8, alpha_ab.val[2]);
+        vst1q_f32(out_ptr + 12, alpha_ab.val[3]);
+    },
+    in, out);
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
+    const float16x8_t beta_f16 = vdupq_n_f16(beta);
+
+    Iterator in(input, window);
+    Iterator out(output, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const auto in_ptr  = reinterpret_cast<const float16_t *>(in.ptr());
+        const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
+
+        float16x8x2_t alpha_ab =
+        {
+            {
+                vld1q_f16(out_ptr + 0),
+                vld1q_f16(out_ptr + 8)
+            }
+        };
+
+        float16x8x2_t c =
+        {
+            {
+                vld1q_f16(in_ptr + 0),
+                vld1q_f16(in_ptr + 8)
+            }
+        };
+
+        // Multiply matrix C by its weight and accumulate
+        alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
+        alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
+
+        vst1q_f16(out_ptr + 0, alpha_ab.val[0]);
+        vst1q_f16(out_ptr + 8, alpha_ab.val[1]);
+    },
+    in, out);
+}
+#endif
+
+void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
+    const int        fixed_point_position = input->info()->fixed_point_position();
+    const qint8x16_t beta_qs8             = vdupq_n_qs8(scvt_qs8_f32(beta, fixed_point_position));
+
+    Iterator in(input, window);
+    Iterator out(output, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const auto in_ptr  = reinterpret_cast<const qint8_t *>(in.ptr());
+        const auto out_ptr = reinterpret_cast<qint8_t *>(out.ptr());
+
+        qint8x16_t       alpha_ab = vld1q_qs8(out_ptr);
+        const qint8x16_t c        = vld1q_qs8(in_ptr);
+
+        // Multiply matrix C by its weight and accumulate
+        alpha_ab = vqmlaq_qs8(alpha_ab, c, beta_qs8, fixed_point_position);
+
+        vst1q_qs8(out_ptr, alpha_ab);
+    },
+    in, out);
+}
+} // namespace
+
 NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel()
-    : INESimpleKernel(), _beta(0.0f)
+    : INESimpleKernel(), _func(nullptr), _beta(0.0f)
 {
 }
 
-void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, const float beta)
+void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
 
+    switch(input->info()->data_type())
+    {
+        case DataType::F32:
+            _func = &matrix_addition_f32;
+            break;
+        case DataType::QS8:
+            _func = &matrix_addition_qs8;
+            break;
+        case DataType::F16:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+            _func = &matrix_addition_f16;
+            break;
+#endif
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported");
+            break;
+    }
+
     constexpr unsigned int num_elems_processed_per_iteration = 16;
 
     INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
@@ -62,103 +195,8 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
 
-    if(0.0f != _beta)
+    if(_beta != 0.0f)
     {
-        switch(_input->info()->data_type())
-        {
-            case DataType::F32:
-            {
-                const float32x4_t beta_f32 = vdupq_n_f32(_beta);
-
-                Iterator in(_input, window);
-                Iterator out(_output, window);
-
-                execute_window_loop(window, [&](const Coordinates & id)
-                {
-                    const auto in_ptr  = reinterpret_cast<const float *>(in.ptr());
-                    const auto out_ptr = reinterpret_cast<float *>(out.ptr());
-
-                    float32x4x4_t alpha_ab =
-                    {
-                        {
-                            vld1q_f32(out_ptr + 0),
-                            vld1q_f32(out_ptr + 4),
-                            vld1q_f32(out_ptr + 8),
-                            vld1q_f32(out_ptr + 12)
-                        }
-                    };
-
-                    const float32x4x4_t c =
-                    {
-                        {
-                            vld1q_f32(in_ptr + 0),
-                            vld1q_f32(in_ptr + 4),
-                            vld1q_f32(in_ptr + 8),
-                            vld1q_f32(in_ptr + 12)
-                        }
-                    };
-
-                    /* Multiply matrix C by its weight and accumulate */
-                    alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
-                    alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
-                    alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
-                    alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
-
-                    vst1q_f32(out_ptr + 0, alpha_ab.val[0]);
-                    vst1q_f32(out_ptr + 4, alpha_ab.val[1]);
-                    vst1q_f32(out_ptr + 8, alpha_ab.val[2]);
-                    vst1q_f32(out_ptr + 12, alpha_ab.val[3]);
-                },
-                in, out);
-
-                break;
-            }
-            case DataType::F16:
-#ifdef ARM_COMPUTE_ENABLE_FP16
-                {
-                    const float16x8_t beta_f16 = vdupq_n_f16(_beta);
-
-                    Iterator in(_input, window);
-                    Iterator out(_output, window);
-
-                    execute_window_loop(window, [&](const Coordinates & id)
-                    {
-                        const auto in_ptr  = reinterpret_cast<const float16_t *>(in.ptr());
-                        const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
-
-                        float16x8x2_t alpha_ab =
-                        {
-                            {
-                                vld1q_f16(out_ptr + 0),
-                                vld1q_f16(out_ptr + 8)
-                            }
-                        };
-
-                        float16x8x2_t c =
-                        {
-                            {
-                                vld1q_f16(in_ptr + 0),
-                                vld1q_f16(in_ptr + 8)
-                            }
-                        };
-
-                        /* Multiply matrix C by its weight and accumulate */
-                        alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
-                        alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
-
-                        vst1q_f16(out_ptr + 0, alpha_ab.val[0]);
-                        vst1q_f16(out_ptr + 8, alpha_ab.val[1]);
-                    },
-                    in, out);
-
-                    break;
-                }
-#endif
-            default:
-            {
-                ARM_COMPUTE_ERROR("Data type not supported");
-                break;
-            }
-        }
+        (*_func)(_input, _output, window, _beta);
     }
 }
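
All three specialisations implement the same beta term of D = alpha * A * B + beta * C: the output already holds alpha * A * B and the kernel accumulates beta * C on top of it, skipping the work entirely when beta is zero. A scalar reference of what each function computes (free function for illustration only):

// Scalar reference of the beta accumulation; the QS8 variant does the same with a
// saturating fixed-point multiply-accumulate at the tensor's fixed_point_position.
void matrix_addition_reference(const float *c, float *out, int num_elements, float beta)
{
    if(beta == 0.0f)
    {
        return; // mirrors the early-out in run()
    }
    for(int i = 0; i < num_elements; ++i)
    {
        out[i] += beta * c[i];
    }
}
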
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index 53f4eed..dcfbb13 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
@@ -55,10 +56,11 @@
     const auto in_b_stride     = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
     const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
 
+    // The implementation computes 16 elements per iteration
     const int window_start_x = 16 * window.thread_id();
     const int window_step_x  = 16 * window.num_threads();
     // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
-    const int window_end_x = width_matrix_b + (window_step_x - ((width_matrix_b - window_start_x) % window_step_x)) % window_step_x;
+    const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
 
     Window win_out(window);
     win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
@@ -82,8 +84,6 @@
     Iterator inb(input1, win_b);
     Iterator out(output, win_out);
 
-    const auto vec_a = reinterpret_cast<const float *>(ina.ptr());
-
     execute_window_loop(win_out, [&](const Coordinates & id)
     {
         if(id.x() > width_matrix_b)
@@ -96,33 +96,37 @@
         float32x4_t acc2 = vdupq_n_f32(0.f);
         float32x4_t acc3 = vdupq_n_f32(0.f);
 
-        const auto matrix_b = reinterpret_cast<const float *>(inb.ptr());
+        auto vec_a    = reinterpret_cast<const float *>(ina.ptr());
+        auto matrix_b = reinterpret_cast<const float *>(inb.ptr());
 
-        int i = 0;
-        for(; i <= (num_elems_vec_a - 4); i += 4)
+#if __arm__
+        asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+        asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+        asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+#endif
+
+        auto vec_a_end_addr = vec_a + num_elems_vec_a;
+        for(; vec_a <= (vec_a_end_addr - 4);)
         {
-            const float32x2_t a0l = vld1_f32(&vec_a[i]);
-            const float32x2_t a0h = vld1_f32(&vec_a[i] + 2);
+            float32x2_t a0l = vld1_f32(vec_a);
 
-            const float32x4_t b00 = vld1q_f32(&matrix_b[0 + (i + 0) * in_b_stride]);
-            const float32x4_t b01 = vld1q_f32(&matrix_b[4 + (i + 0) * in_b_stride]);
-            const float32x4_t b02 = vld1q_f32(&matrix_b[8 + (i + 0) * in_b_stride]);
-            const float32x4_t b03 = vld1q_f32(&matrix_b[12 + (i + 0) * in_b_stride]);
+            float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+            float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+            float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+            float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
 
-            const float32x4_t b10 = vld1q_f32(&matrix_b[0 + (i + 1) * in_b_stride]);
-            const float32x4_t b11 = vld1q_f32(&matrix_b[4 + (i + 1) * in_b_stride]);
-            const float32x4_t b12 = vld1q_f32(&matrix_b[8 + (i + 1) * in_b_stride]);
-            const float32x4_t b13 = vld1q_f32(&matrix_b[12 + (i + 1) * in_b_stride]);
+            float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+            float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+            float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+            float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
 
-            const float32x4_t b20 = vld1q_f32(&matrix_b[0 + (i + 2) * in_b_stride]);
-            const float32x4_t b21 = vld1q_f32(&matrix_b[4 + (i + 2) * in_b_stride]);
-            const float32x4_t b22 = vld1q_f32(&matrix_b[8 + (i + 2) * in_b_stride]);
-            const float32x4_t b23 = vld1q_f32(&matrix_b[12 + (i + 2) * in_b_stride]);
-
-            const float32x4_t b30 = vld1q_f32(&matrix_b[0 + (i + 3) * in_b_stride]);
-            const float32x4_t b31 = vld1q_f32(&matrix_b[4 + (i + 3) * in_b_stride]);
-            const float32x4_t b32 = vld1q_f32(&matrix_b[8 + (i + 3) * in_b_stride]);
-            const float32x4_t b33 = vld1q_f32(&matrix_b[12 + (i + 3) * in_b_stride]);
+#if __arm__
+            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+#endif
 
             acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
             acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
@@ -134,30 +138,51 @@
             acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
             acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
 
-            acc0 = vmlaq_lane_f32(acc0, b20, a0h, 0);
-            acc1 = vmlaq_lane_f32(acc1, b21, a0h, 0);
-            acc2 = vmlaq_lane_f32(acc2, b22, a0h, 0);
-            acc3 = vmlaq_lane_f32(acc3, b23, a0h, 0);
+            vec_a += 2;
+            matrix_b += 2 * in_b_stride;
 
-            acc0 = vmlaq_lane_f32(acc0, b30, a0h, 1);
-            acc1 = vmlaq_lane_f32(acc1, b31, a0h, 1);
-            acc2 = vmlaq_lane_f32(acc2, b32, a0h, 1);
-            acc3 = vmlaq_lane_f32(acc3, b33, a0h, 1);
+            a0l = vld1_f32(vec_a);
+
+            b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+            b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+            b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+            b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+            b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+            b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+            b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+            b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+            acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+            acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+            acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+            acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+            acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+            acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+            acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+            acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+            vec_a += 2;
+            matrix_b += 2 * in_b_stride;
         }
 
-        for(; i < num_elems_vec_a; i++)
+        for(; vec_a < vec_a_end_addr;)
         {
-            const float a0 = vec_a[i];
+            const float a0 = *vec_a;
 
-            const float32x4_t b00 = vld1q_f32(&matrix_b[0 + i * in_b_stride]);
-            const float32x4_t b01 = vld1q_f32(&matrix_b[4 + i * in_b_stride]);
-            const float32x4_t b02 = vld1q_f32(&matrix_b[8 + i * in_b_stride]);
-            const float32x4_t b03 = vld1q_f32(&matrix_b[12 + i * in_b_stride]);
+            const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+            const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+            const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+            const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
 
             acc0 = vmlaq_n_f32(acc0, b00, a0);
             acc1 = vmlaq_n_f32(acc1, b01, a0);
             acc2 = vmlaq_n_f32(acc2, b02, a0);
             acc3 = vmlaq_n_f32(acc3, b03, a0);
+
+            vec_a += 1;
+            matrix_b += in_b_stride;
         }
 
         // Multiply by the weight of matrix product (alpha)
@@ -172,12 +197,140 @@
 
         const auto vec_out = reinterpret_cast<float *>(out.ptr());
 
-        vst1q_f32(&vec_out[0], acc0);
-        vst1q_f32(&vec_out[4], acc1);
-        vst1q_f32(&vec_out[8], acc2);
-        vst1q_f32(&vec_out[12], acc3);
+        vst1q_f32(vec_out + 0, acc0);
+        vst1q_f32(vec_out + 4, acc1);
+        vst1q_f32(vec_out + 8, acc2);
+        vst1q_f32(vec_out + 12, acc3);
     },
-    inb, out);
+    ina, inb, out);
+}
+
+template <bool multiply_alpha>
+void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+    const auto width_matrix_b       = static_cast<int>(output->info()->dimension(0));
+    const auto in_b_stride          = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+    const auto num_elems_vec_a      = static_cast<int>(input0->info()->dimension(0));
+    const int  fixed_point_position = input0->info()->fixed_point_position();
+
+    // The implementation computes 32 elements per iteration
+    const int window_start_x = 32 * window.thread_id();
+    const int window_step_x  = 32 * window.num_threads();
+    // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+    const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+
+    Window win_out(window);
+    win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+    win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    Window win_a(window);
+    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    Window win_b;
+    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+    if(input1->info()->num_dimensions() >= 3)
+    {
+        win_b = window;
+    }
+    win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+    win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    Iterator ina(input0, win_a);
+    Iterator inb(input1, win_b);
+    Iterator out(output, win_out);
+
+    execute_window_loop(win_out, [&](const Coordinates & id)
+    {
+        if(id.x() > width_matrix_b)
+        {
+            return;
+        }
+
+        // Reset accumulators
+        qint16x8_t acc00_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc01_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc02_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc03_qs16 = vdupq_n_qs16(0);
+
+        auto vec_a    = reinterpret_cast<const qint8_t *>(ina.ptr());
+        auto matrix_b = reinterpret_cast<const qint8_t *>(inb.ptr());
+
+        auto vec_a_end_addr = vec_a + num_elems_vec_a;
+        for(; vec_a <= (vec_a_end_addr - 2);)
+        {
+            const qint8x8_t a0 = vld1_dup_qs8(vec_a + 0);
+            const qint8x8_t a1 = vld1_dup_qs8(vec_a + 1);
+
+            const qint8x8_t b00 = vld1_qs8(matrix_b + 0 + 0 * in_b_stride);
+            const qint8x8_t b01 = vld1_qs8(matrix_b + 8 + 0 * in_b_stride);
+            const qint8x8_t b02 = vld1_qs8(matrix_b + 16 + 0 * in_b_stride);
+            const qint8x8_t b03 = vld1_qs8(matrix_b + 24 + 0 * in_b_stride);
+            const qint8x8_t b10 = vld1_qs8(matrix_b + 0 + 1 * in_b_stride);
+            const qint8x8_t b11 = vld1_qs8(matrix_b + 8 + 1 * in_b_stride);
+            const qint8x8_t b12 = vld1_qs8(matrix_b + 16 + 1 * in_b_stride);
+            const qint8x8_t b13 = vld1_qs8(matrix_b + 24 + 1 * in_b_stride);
+
+            // First accumulation
+            acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+            acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+            acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position);
+            acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position);
+
+            // Second accumulation
+            acc00_qs16 = vqmlal_qs8(acc00_qs16, b10, a1, fixed_point_position);
+            acc01_qs16 = vqmlal_qs8(acc01_qs16, b11, a1, fixed_point_position);
+            acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a1, fixed_point_position);
+            acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a1, fixed_point_position);
+
+            vec_a += 2;
+            matrix_b += 2 * in_b_stride;
+        }
+
+        for(; vec_a < vec_a_end_addr;)
+        {
+            const qint8x8_t a0 = vld1_dup_qs8(vec_a);
+
+            const qint8x8_t b00 = vld1_qs8(matrix_b + 0);
+            const qint8x8_t b01 = vld1_qs8(matrix_b + 8);
+            const qint8x8_t b02 = vld1_qs8(matrix_b + 16);
+            const qint8x8_t b03 = vld1_qs8(matrix_b + 24);
+
+            acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+            acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+            acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position);
+            acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position);
+
+            vec_a += 1;
+            matrix_b += in_b_stride;
+        }
+
+        // Convert back to qint8x8_t and saturate
+        qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16);
+        qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16);
+        qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16);
+        qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16);
+
+        // Multiply by the weight of the matrix product (alpha)
+        if(multiply_alpha)
+        {
+            const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+            acc00_qs8                 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
+            acc01_qs8                 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
+            acc02_qs8                 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
+            acc03_qs8                 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position);
+        }
+
+        const auto mtx_out0 = reinterpret_cast<qint8_t *>(out.ptr());
+
+        // Store 8x4 output elements
+        vst1_qs8(mtx_out0 + 0, acc00_qs8);
+        vst1_qs8(mtx_out0 + 8, acc01_qs8);
+        vst1_qs8(mtx_out0 + 16, acc02_qs8);
+        vst1_qs8(mtx_out0 + 24, acc03_qs8);
+    },
+    ina, inb, out);
 }
 
 template <bool multiply_alpha>
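
The QS8 vector-matrix path above leans on the NEFixedPoint helpers: operands are Q(7-fpp).fpp numbers, so each multiply is followed by a right shift of fixed_point_position and the accumulation saturates. A rough scalar model of that arithmetic, rounding omitted for brevity; the helper names are illustrative, not the NEFixedPoint API.

#include <algorithm>
#include <cstdint>

// Rough scalar model of a vqmlal_qs8-style multiply-accumulate: widen, multiply,
// rescale by the fixed point position and saturate the 16-bit accumulator.
inline int16_t qs8_mla_q16(int16_t acc, int8_t a, int8_t b, int fixed_point_position)
{
    const int32_t res = acc + ((static_cast<int32_t>(a) * static_cast<int32_t>(b)) >> fixed_point_position);
    return static_cast<int16_t>(std::min<int32_t>(std::max<int32_t>(res, INT16_MIN), INT16_MAX));
}

// Rough equivalent of converting a float scalar (e.g. alpha) into the QS8 format.
inline int8_t float_to_qs8(float v, int fixed_point_position)
{
    const int32_t res = static_cast<int32_t>(v * static_cast<float>(1 << fixed_point_position) + (v >= 0.0f ? 0.5f : -0.5f));
    return static_cast<int8_t>(std::min<int32_t>(std::max<int32_t>(res, -128), 127));
}
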
@@ -202,9 +355,9 @@
         win_b = window;
     }
     // Set step_x and step_y for matrix B. Scale by a factor of 4 the X range as the input transposed matrix A has 4 times less the cols of the output matrix
-    // The step along the x direction is 4 times the in_b_stride because for each iteration we compute 4 blocks of size 4x4
-    win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 4 * in_b_stride));
-    win_b.set(Window::DimY, Window::Dimension(0, 1, 0));
+    // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 4x4
+    win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 2 * in_b_stride));
+    win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
 
     Iterator ina(input0, win_a);
     Iterator inb(input1, win_b);
@@ -218,8 +371,6 @@
         auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
         auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
         auto mtx_b1 = mtx_b0 + in_b_stride;
-        auto mtx_b2 = mtx_b1 + in_b_stride;
-        auto mtx_b3 = mtx_b2 + in_b_stride;
 
         float32x4_t acc00 = vdupq_n_f32(0.f);
         float32x4_t acc10 = vdupq_n_f32(0.f);
@@ -231,55 +382,227 @@
         float32x4_t acc21 = vdupq_n_f32(0.f);
         float32x4_t acc31 = vdupq_n_f32(0.f);
 
-        float32x4_t acc02 = vdupq_n_f32(0.f);
-        float32x4_t acc12 = vdupq_n_f32(0.f);
-        float32x4_t acc22 = vdupq_n_f32(0.f);
-        float32x4_t acc32 = vdupq_n_f32(0.f);
+#if __arm__
+        asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+        asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+        asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
 
-        float32x4_t acc03 = vdupq_n_f32(0.f);
-        float32x4_t acc13 = vdupq_n_f32(0.f);
-        float32x4_t acc23 = vdupq_n_f32(0.f);
-        float32x4_t acc33 = vdupq_n_f32(0.f);
-
-        for(int k = 0; k < num_elems_matrix_b_x; k += 4)
+        auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
+        for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
         {
-            const float32x4_t a    = vld1q_f32(mtx_a0);
-            const float32x2_t a00l = vget_low_f32(a);
-            const float32x2_t a00h = vget_high_f32(a);
-            const float32x4_t b00  = vld1q_f32(mtx_b0);
-            const float32x4_t b10  = vld1q_f32(mtx_b1);
-            const float32x4_t b20  = vld1q_f32(mtx_b2);
-            const float32x4_t b30  = vld1q_f32(mtx_b3);
+            float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+            float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+            float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+            float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+            float32x4_t b00 = vld1q_f32(mtx_b0);
+            float32x4_t b10 = vld1q_f32(mtx_b1);
+            float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
+            float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
+
+#if __arm__
+            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
 
             // 4x4 block 0
-            acc00 = vmlaq_lane_f32(acc00, b00, a00l, 0);
-            acc10 = vmlaq_lane_f32(acc10, b00, a00l, 1);
-            acc20 = vmlaq_lane_f32(acc20, b00, a00h, 0);
-            acc30 = vmlaq_lane_f32(acc30, b00, a00h, 1);
+            acc00 = vmlaq_f32(acc00, b00, a0);
+            acc10 = vmlaq_f32(acc10, b00, a1);
+            acc20 = vmlaq_f32(acc20, b00, a2);
+            acc30 = vmlaq_f32(acc30, b00, a3);
+
+            float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
+            float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
+            float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
+            float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
 
             // 4x4 block 1
-            acc01 = vmlaq_lane_f32(acc01, b10, a00l, 0);
-            acc11 = vmlaq_lane_f32(acc11, b10, a00l, 1);
-            acc21 = vmlaq_lane_f32(acc21, b10, a00h, 0);
-            acc31 = vmlaq_lane_f32(acc31, b10, a00h, 1);
+            acc01 = vmlaq_f32(acc01, b10, a0);
+            acc11 = vmlaq_f32(acc11, b10, a1);
+            acc21 = vmlaq_f32(acc21, b10, a2);
+            acc31 = vmlaq_f32(acc31, b10, a3);
 
-            // 4x4 block 2
-            acc02 = vmlaq_lane_f32(acc02, b20, a00l, 0);
-            acc12 = vmlaq_lane_f32(acc12, b20, a00l, 1);
-            acc22 = vmlaq_lane_f32(acc22, b20, a00h, 0);
-            acc32 = vmlaq_lane_f32(acc32, b20, a00h, 1);
+            // 4x4 block 0
+            acc00 = vmlaq_f32(acc00, b01, a4);
+            acc10 = vmlaq_f32(acc10, b01, a5);
+            acc20 = vmlaq_f32(acc20, b01, a6);
+            acc30 = vmlaq_f32(acc30, b01, a7);
 
-            // 4x4 block 3
-            acc03 = vmlaq_lane_f32(acc03, b30, a00l, 0);
-            acc13 = vmlaq_lane_f32(acc13, b30, a00l, 1);
-            acc23 = vmlaq_lane_f32(acc23, b30, a00h, 0);
-            acc33 = vmlaq_lane_f32(acc33, b30, a00h, 1);
+            // 4x4 block 1
+            acc01 = vmlaq_f32(acc01, b11, a4);
+            acc11 = vmlaq_f32(acc11, b11, a5);
+            acc21 = vmlaq_f32(acc21, b11, a6);
+            acc31 = vmlaq_f32(acc31, b11, a7);
+
+            mtx_a0 += 8;
+            mtx_b0 += 8;
+            mtx_b1 += 8;
+
+            a0 = vld1q_dup_f32(mtx_a0 + 0);
+            a1 = vld1q_dup_f32(mtx_a0 + 1);
+            a2 = vld1q_dup_f32(mtx_a0 + 2);
+            a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+            b00 = vld1q_f32(mtx_b0);
+            b10 = vld1q_f32(mtx_b1);
+            b01 = vld1q_f32(mtx_b0 + 4);
+            b11 = vld1q_f32(mtx_b1 + 4);
+
+            // 4x4 block 0
+            acc00 = vmlaq_f32(acc00, b00, a0);
+            acc10 = vmlaq_f32(acc10, b00, a1);
+            acc20 = vmlaq_f32(acc20, b00, a2);
+            acc30 = vmlaq_f32(acc30, b00, a3);
+
+            a4 = vld1q_dup_f32(mtx_a0 + 4);
+            a5 = vld1q_dup_f32(mtx_a0 + 5);
+            a6 = vld1q_dup_f32(mtx_a0 + 6);
+            a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+            // 4x4 block 1
+            acc01 = vmlaq_f32(acc01, b10, a0);
+            acc11 = vmlaq_f32(acc11, b10, a1);
+            acc21 = vmlaq_f32(acc21, b10, a2);
+            acc31 = vmlaq_f32(acc31, b10, a3);
+
+            // 4x4 block 0
+            acc00 = vmlaq_f32(acc00, b01, a4);
+            acc10 = vmlaq_f32(acc10, b01, a5);
+            acc20 = vmlaq_f32(acc20, b01, a6);
+            acc30 = vmlaq_f32(acc30, b01, a7);
+
+            // 4x4 block 1
+            acc01 = vmlaq_f32(acc01, b11, a4);
+            acc11 = vmlaq_f32(acc11, b11, a5);
+            acc21 = vmlaq_f32(acc21, b11, a6);
+            acc31 = vmlaq_f32(acc31, b11, a7);
+
+            mtx_a0 += 8;
+            mtx_b0 += 8;
+            mtx_b1 += 8;
+
+            a0  = vld1q_dup_f32(mtx_a0 + 0);
+            a1  = vld1q_dup_f32(mtx_a0 + 1);
+            a2  = vld1q_dup_f32(mtx_a0 + 2);
+            a3  = vld1q_dup_f32(mtx_a0 + 3);
+            b00 = vld1q_f32(mtx_b0);
+            b10 = vld1q_f32(mtx_b1);
+            b01 = vld1q_f32(mtx_b0 + 4);
+            b11 = vld1q_f32(mtx_b1 + 4);
+
+#if __arm__
+            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+
+            // 4x4 block 0
+            acc00 = vmlaq_f32(acc00, b00, a0);
+            acc10 = vmlaq_f32(acc10, b00, a1);
+            acc20 = vmlaq_f32(acc20, b00, a2);
+            acc30 = vmlaq_f32(acc30, b00, a3);
+
+            a4 = vld1q_dup_f32(mtx_a0 + 4);
+            a5 = vld1q_dup_f32(mtx_a0 + 5);
+            a6 = vld1q_dup_f32(mtx_a0 + 6);
+            a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+            // 4x4 block 1
+            acc01 = vmlaq_f32(acc01, b10, a0);
+            acc11 = vmlaq_f32(acc11, b10, a1);
+            acc21 = vmlaq_f32(acc21, b10, a2);
+            acc31 = vmlaq_f32(acc31, b10, a3);
+
+            // 4x4 block 0
+            acc00 = vmlaq_f32(acc00, b01, a4);
+            acc10 = vmlaq_f32(acc10, b01, a5);
+            acc20 = vmlaq_f32(acc20, b01, a6);
+            acc30 = vmlaq_f32(acc30, b01, a7);
+
+            // 4x4 block 1
+            acc01 = vmlaq_f32(acc01, b11, a4);
+            acc11 = vmlaq_f32(acc11, b11, a5);
+            acc21 = vmlaq_f32(acc21, b11, a6);
+            acc31 = vmlaq_f32(acc31, b11, a7);
+
+            mtx_a0 += 8;
+            mtx_b0 += 8;
+            mtx_b1 += 8;
+
+            a0  = vld1q_dup_f32(mtx_a0 + 0);
+            a1  = vld1q_dup_f32(mtx_a0 + 1);
+            a2  = vld1q_dup_f32(mtx_a0 + 2);
+            a3  = vld1q_dup_f32(mtx_a0 + 3);
+            b00 = vld1q_f32(mtx_b0);
+            b10 = vld1q_f32(mtx_b1);
+            b01 = vld1q_f32(mtx_b0 + 4);
+            b11 = vld1q_f32(mtx_b1 + 4);
+
+            // 4x4 block 0
+            acc00 = vmlaq_f32(acc00, b00, a0);
+            acc10 = vmlaq_f32(acc10, b00, a1);
+            acc20 = vmlaq_f32(acc20, b00, a2);
+            acc30 = vmlaq_f32(acc30, b00, a3);
+
+            a4 = vld1q_dup_f32(mtx_a0 + 4);
+            a5 = vld1q_dup_f32(mtx_a0 + 5);
+            a6 = vld1q_dup_f32(mtx_a0 + 6);
+            a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+            // 4x4 block 1
+            acc01 = vmlaq_f32(acc01, b10, a0);
+            acc11 = vmlaq_f32(acc11, b10, a1);
+            acc21 = vmlaq_f32(acc21, b10, a2);
+            acc31 = vmlaq_f32(acc31, b10, a3);
+
+            // 4x4 block 0
+            acc00 = vmlaq_f32(acc00, b01, a4);
+            acc10 = vmlaq_f32(acc10, b01, a5);
+            acc20 = vmlaq_f32(acc20, b01, a6);
+            acc30 = vmlaq_f32(acc30, b01, a7);
+
+            // 4x4 block 1
+            acc01 = vmlaq_f32(acc01, b11, a4);
+            acc11 = vmlaq_f32(acc11, b11, a5);
+            acc21 = vmlaq_f32(acc21, b11, a6);
+            acc31 = vmlaq_f32(acc31, b11, a7);
+
+            mtx_a0 += 8;
+            mtx_b0 += 8;
+            mtx_b1 += 8;
+        }
+
+        for(; mtx_b0 < mtx_b0_end_addr;)
+        {
+            float32x4_t a0  = vld1q_dup_f32(mtx_a0 + 0);
+            float32x4_t a1  = vld1q_dup_f32(mtx_a0 + 1);
+            float32x4_t a2  = vld1q_dup_f32(mtx_a0 + 2);
+            float32x4_t a3  = vld1q_dup_f32(mtx_a0 + 3);
+            float32x4_t b00 = vld1q_f32(mtx_b0);
+            float32x4_t b10 = vld1q_f32(mtx_b1);
+
+#if __arm__
+            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+            // 4x4 block 0
+            acc00 = vmlaq_f32(acc00, b00, a0);
+            acc10 = vmlaq_f32(acc10, b00, a1);
+            acc20 = vmlaq_f32(acc20, b00, a2);
+            acc30 = vmlaq_f32(acc30, b00, a3);
+
+            // 4x4 block 1
+            acc01 = vmlaq_f32(acc01, b10, a0);
+            acc11 = vmlaq_f32(acc11, b10, a1);
+            acc21 = vmlaq_f32(acc21, b10, a2);
+            acc31 = vmlaq_f32(acc31, b10, a3);
 
             mtx_a0 += 4;
             mtx_b0 += 4;
             mtx_b1 += 4;
-            mtx_b2 += 4;
-            mtx_b3 += 4;
         }
 
         // Multiply by the weight of matrix product (alpha)
@@ -294,38 +617,20 @@
             acc11                       = vmulq_f32(acc11, alpha_f32);
             acc21                       = vmulq_f32(acc21, alpha_f32);
             acc31                       = vmulq_f32(acc31, alpha_f32);
-            acc02                       = vmulq_f32(acc02, alpha_f32);
-            acc12                       = vmulq_f32(acc12, alpha_f32);
-            acc22                       = vmulq_f32(acc22, alpha_f32);
-            acc32                       = vmulq_f32(acc32, alpha_f32);
-            acc03                       = vmulq_f32(acc03, alpha_f32);
-            acc13                       = vmulq_f32(acc13, alpha_f32);
-            acc23                       = vmulq_f32(acc23, alpha_f32);
-            acc33                       = vmulq_f32(acc33, alpha_f32);
         }
 
         const auto mtx_out0 = reinterpret_cast<float *>(out.ptr());
         const auto mtx_out1 = mtx_out0 + 4;
-        const auto mtx_out2 = mtx_out1 + 4;
-        const auto mtx_out3 = mtx_out2 + 4;
 
         // Store the 4 blocks
         vst1q_f32(mtx_out0, acc00);
         vst1q_f32(mtx_out1, acc01);
-        vst1q_f32(mtx_out2, acc02);
-        vst1q_f32(mtx_out3, acc03);
         vst1q_f32(mtx_out0 + out_stride1, acc10);
         vst1q_f32(mtx_out1 + out_stride1, acc11);
-        vst1q_f32(mtx_out2 + out_stride1, acc12);
-        vst1q_f32(mtx_out3 + out_stride1, acc13);
         vst1q_f32(mtx_out0 + out_stride2, acc20);
         vst1q_f32(mtx_out1 + out_stride2, acc21);
-        vst1q_f32(mtx_out2 + out_stride2, acc22);
-        vst1q_f32(mtx_out3 + out_stride2, acc23);
         vst1q_f32(mtx_out0 + out_stride3, acc30);
         vst1q_f32(mtx_out1 + out_stride3, acc31);
-        vst1q_f32(mtx_out2 + out_stride3, acc32);
-        vst1q_f32(mtx_out3 + out_stride3, acc33);
     },
     ina, inb, out);
 }
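
For reference, a minimal NEON sketch (not part of the patch) of the broadcast multiply-accumulate pattern the rewritten F32 loop above is built from: one k step updating one 4x4 output tile. The helper name is hypothetical; A is assumed interleaved so that four consecutive values belong to four different output rows.

#include <arm_neon.h>

inline void mac_4x4_step_sketch(const float *a /* 4 interleaved A values */,
                                const float *b /* 4 consecutive B values */,
                                float32x4_t acc[4])
{
    const float32x4_t b0 = vld1q_f32(b);
    // Each accumulator row receives b0 scaled by the A value of that row.
    acc[0] = vmlaq_f32(acc[0], b0, vld1q_dup_f32(a + 0));
    acc[1] = vmlaq_f32(acc[1], b0, vld1q_dup_f32(a + 1));
    acc[2] = vmlaq_f32(acc[2], b0, vld1q_dup_f32(a + 2));
    acc[3] = vmlaq_f32(acc[3], b0, vld1q_dup_f32(a + 3));
}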
@@ -453,6 +758,240 @@
     ARM_COMPUTE_ERROR("Not implemented");
 #endif
 }
+
+template <bool multiply_alpha>
+void matrix_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+    const size_t    in_b_stride          = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
+    const size_t    out_stride1          = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
+    const size_t    out_stride2          = out_stride1 * 2;
+    const size_t    out_stride3          = out_stride1 * 3;
+    const int       num_elems_matrix_b_x = input1->info()->dimension(0);
+    const int       fixed_point_position = input0->info()->fixed_point_position();
+    const qint8x8_t alpha_qs8            = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+    ARM_COMPUTE_UNUSED(alpha_qs8);
+
+    // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has 4 times fewer rows than the output matrix
+    Window win_a(window);
+    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
+
+    Window win_b;
+    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+    if(input1->info()->num_dimensions() >= 3)
+    {
+        win_b = window;
+    }
+    // Set step_x and step_y for matrix B. Scale the X range by a factor of 16 as the transposed input matrix B has 16 times fewer columns than the output matrix
+    // The step along the x direction is 2 times in_b_stride because each iteration computes 2 blocks of size 16x4
+    win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, 2 * in_b_stride));
+    win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    Iterator ina(input0, win_a);
+    Iterator inb(input1, win_b);
+    Iterator out(output, window);
+
+    // The implementation assumes that matrix A and matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW
+    // The reshaping of the matrices gives a cache-friendly implementation and avoids the data re-arrangements needed for computing 32x4 elements per iteration
+    // All the values needed for computing a single 32x4 block will be read from consecutive memory positions
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        auto mtx_a0 = reinterpret_cast<const qint8_t *>(ina.ptr());
+        auto mtx_b0 = reinterpret_cast<const qint8_t *>(inb.ptr());
+        auto mtx_b1 = mtx_b0 + in_b_stride;
+
+        qint16x8_t acc00_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc10_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc20_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc30_qs16 = vdupq_n_qs16(0);
+
+        qint16x8_t acc01_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc11_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc21_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc31_qs16 = vdupq_n_qs16(0);
+
+        qint16x8_t acc02_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc12_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc22_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc32_qs16 = vdupq_n_qs16(0);
+
+        qint16x8_t acc03_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc13_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc23_qs16 = vdupq_n_qs16(0);
+        qint16x8_t acc33_qs16 = vdupq_n_qs16(0);
+
+        int k = 0;
+        // This loop performs 2 accumulation steps per iteration
+        for(; k <= (num_elems_matrix_b_x - 32); k += 32)
+        {
+            const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0);
+            const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1);
+            const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2);
+            const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3);
+            const qint8x8_t a4 = vld1_dup_qs8(mtx_a0 + 4);
+            const qint8x8_t a5 = vld1_dup_qs8(mtx_a0 + 5);
+            const qint8x8_t a6 = vld1_dup_qs8(mtx_a0 + 6);
+            const qint8x8_t a7 = vld1_dup_qs8(mtx_a0 + 7);
+
+            const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0);
+            const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8);
+            const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0);
+            const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8);
+
+            // First accumulation
+            acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+            acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position);
+            acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position);
+            acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position);
+            acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position);
+            acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position);
+            acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position);
+            acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position);
+
+            const qint8x8_t b02 = vld1_qs8(mtx_b0 + 16);
+            const qint8x8_t b03 = vld1_qs8(mtx_b0 + 24);
+            const qint8x8_t b12 = vld1_qs8(mtx_b1 + 16);
+            const qint8x8_t b13 = vld1_qs8(mtx_b1 + 24);
+
+            acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+            acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position);
+            acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position);
+            acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position);
+            acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position);
+            acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position);
+            acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position);
+            acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position);
+
+#if __arm__
+            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+
+            // Second accumulation
+            acc00_qs16 = vqmlal_qs8(acc00_qs16, b02, a4, fixed_point_position);
+            acc10_qs16 = vqmlal_qs8(acc10_qs16, b02, a5, fixed_point_position);
+            acc20_qs16 = vqmlal_qs8(acc20_qs16, b02, a6, fixed_point_position);
+            acc30_qs16 = vqmlal_qs8(acc30_qs16, b02, a7, fixed_point_position);
+            acc01_qs16 = vqmlal_qs8(acc01_qs16, b03, a4, fixed_point_position);
+            acc11_qs16 = vqmlal_qs8(acc11_qs16, b03, a5, fixed_point_position);
+            acc21_qs16 = vqmlal_qs8(acc21_qs16, b03, a6, fixed_point_position);
+            acc31_qs16 = vqmlal_qs8(acc31_qs16, b03, a7, fixed_point_position);
+            acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a4, fixed_point_position);
+            acc12_qs16 = vqmlal_qs8(acc12_qs16, b12, a5, fixed_point_position);
+            acc22_qs16 = vqmlal_qs8(acc22_qs16, b12, a6, fixed_point_position);
+            acc32_qs16 = vqmlal_qs8(acc32_qs16, b12, a7, fixed_point_position);
+            acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a4, fixed_point_position);
+            acc13_qs16 = vqmlal_qs8(acc13_qs16, b13, a5, fixed_point_position);
+            acc23_qs16 = vqmlal_qs8(acc23_qs16, b13, a6, fixed_point_position);
+            acc33_qs16 = vqmlal_qs8(acc33_qs16, b13, a7, fixed_point_position);
+
+            mtx_a0 += 8;
+            mtx_b0 += 32;
+            mtx_b1 += 32;
+        }
+
+        // This loop performs the leftover accumulations
+        for(; k < num_elems_matrix_b_x; k += 16)
+        {
+            const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0);
+            const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1);
+            const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2);
+            const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3);
+
+            const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0);
+            const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8);
+            const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0);
+            const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8);
+
+            acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+            acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position);
+            acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position);
+            acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position);
+            acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+            acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position);
+            acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position);
+            acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position);
+            acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position);
+            acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position);
+            acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position);
+            acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position);
+            acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position);
+            acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position);
+            acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position);
+            acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position);
+
+            mtx_a0 += 4;
+            mtx_b0 += 16;
+            mtx_b1 += 16;
+        }
+
+        // Convert back to qint8x8_t and saturate
+        qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16);
+        qint8x8_t acc10_qs8 = vqmovn_qs16(acc10_qs16);
+        qint8x8_t acc20_qs8 = vqmovn_qs16(acc20_qs16);
+        qint8x8_t acc30_qs8 = vqmovn_qs16(acc30_qs16);
+
+        qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16);
+        qint8x8_t acc11_qs8 = vqmovn_qs16(acc11_qs16);
+        qint8x8_t acc21_qs8 = vqmovn_qs16(acc21_qs16);
+        qint8x8_t acc31_qs8 = vqmovn_qs16(acc31_qs16);
+
+        qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16);
+        qint8x8_t acc12_qs8 = vqmovn_qs16(acc12_qs16);
+        qint8x8_t acc22_qs8 = vqmovn_qs16(acc22_qs16);
+        qint8x8_t acc32_qs8 = vqmovn_qs16(acc32_qs16);
+
+        qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16);
+        qint8x8_t acc13_qs8 = vqmovn_qs16(acc13_qs16);
+        qint8x8_t acc23_qs8 = vqmovn_qs16(acc23_qs16);
+        qint8x8_t acc33_qs8 = vqmovn_qs16(acc33_qs16);
+
+        // Multiply by the weight of the matrix product (alpha)
+        if(multiply_alpha)
+        {
+            acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
+            acc10_qs8 = vqmul_qs8(acc10_qs8, alpha_qs8, fixed_point_position);
+            acc20_qs8 = vqmul_qs8(acc20_qs8, alpha_qs8, fixed_point_position);
+            acc30_qs8 = vqmul_qs8(acc30_qs8, alpha_qs8, fixed_point_position);
+            acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
+            acc11_qs8 = vqmul_qs8(acc11_qs8, alpha_qs8, fixed_point_position);
+            acc21_qs8 = vqmul_qs8(acc21_qs8, alpha_qs8, fixed_point_position);
+            acc31_qs8 = vqmul_qs8(acc31_qs8, alpha_qs8, fixed_point_position);
+            acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
+            acc12_qs8 = vqmul_qs8(acc12_qs8, alpha_qs8, fixed_point_position);
+            acc22_qs8 = vqmul_qs8(acc22_qs8, alpha_qs8, fixed_point_position);
+            acc32_qs8 = vqmul_qs8(acc32_qs8, alpha_qs8, fixed_point_position);
+            acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position);
+            acc13_qs8 = vqmul_qs8(acc13_qs8, alpha_qs8, fixed_point_position);
+            acc23_qs8 = vqmul_qs8(acc23_qs8, alpha_qs8, fixed_point_position);
+            acc33_qs8 = vqmul_qs8(acc33_qs8, alpha_qs8, fixed_point_position);
+        }
+
+        const auto mtx_out0 = reinterpret_cast<qint8_t *>(out.ptr());
+
+        // Store 32x4 output elements
+        vst1_qs8(mtx_out0 + 0, acc00_qs8);
+        vst1_qs8(mtx_out0 + 8, acc01_qs8);
+        vst1_qs8(mtx_out0 + 16, acc02_qs8);
+        vst1_qs8(mtx_out0 + 24, acc03_qs8);
+        vst1_qs8(mtx_out0 + out_stride1 + 0, acc10_qs8);
+        vst1_qs8(mtx_out0 + out_stride1 + 8, acc11_qs8);
+        vst1_qs8(mtx_out0 + out_stride1 + 16, acc12_qs8);
+        vst1_qs8(mtx_out0 + out_stride1 + 24, acc13_qs8);
+        vst1_qs8(mtx_out0 + out_stride2 + 0, acc20_qs8);
+        vst1_qs8(mtx_out0 + out_stride2 + 8, acc21_qs8);
+        vst1_qs8(mtx_out0 + out_stride2 + 16, acc22_qs8);
+        vst1_qs8(mtx_out0 + out_stride2 + 24, acc23_qs8);
+        vst1_qs8(mtx_out0 + out_stride3 + 0, acc30_qs8);
+        vst1_qs8(mtx_out0 + out_stride3 + 8, acc31_qs8);
+        vst1_qs8(mtx_out0 + out_stride3 + 16, acc32_qs8);
+        vst1_qs8(mtx_out0 + out_stride3 + 24, acc33_qs8);
+    },
+    ina, inb, out);
+}
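
For reference, a minimal scalar sketch (not part of the patch) of one lane of the saturating widening multiply-accumulate (vqmlal_qs8) the QS8 kernels above rely on. The exact rounding of the library intrinsic is an assumption here; the sketch only shows why the accumulators are 16 bit and why the product is shifted right by fixed_point_position.

#include <algorithm>
#include <cstdint>

inline int16_t qs8_mac_lane_sketch(int16_t acc, int8_t a, int8_t b, int fixed_point_position)
{
    // The product of two Q-format values carries 2*fp fractional bits; one right shift
    // brings it back to fp fractional bits before accumulating with saturation.
    const int32_t prod = (static_cast<int32_t>(a) * static_cast<int32_t>(b)) >> fixed_point_position;
    const int32_t sum  = static_cast<int32_t>(acc) + prod;
    return static_cast<int16_t>(std::min<int32_t>(INT16_MAX, std::max<int32_t>(INT16_MIN, sum)));
}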
+
 } // namespace
 
 NEGEMMMatrixMultiplyKernel::NEGEMMMatrixMultiplyKernel()
@@ -462,10 +1001,13 @@
 
 void NEGEMMMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::QS8);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
     if(output->info()->dimension(1) == 1)
     {
         ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
@@ -479,10 +1021,27 @@
     unsigned int       num_elems_processed_per_iteration_x = 0;
     const unsigned int num_elems_processed_per_iteration_y = 4;
 
-    // Check if the output tensor is a vector and the data type is F32. If so,the kernel runs the vector-matrix multiplication
-    if((output->info()->dimension(1) == 1) && (input0->info()->data_type() == DataType::F32))
+    // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication
+    if((output->info()->dimension(1) == 1))
     {
-        num_elems_processed_per_iteration_x = 16;
+        switch(input0->info()->data_type())
+        {
+            case DataType::F32:
+            {
+                num_elems_processed_per_iteration_x = 16;
+                break;
+            }
+            case DataType::QS8:
+            {
+                num_elems_processed_per_iteration_x = 32;
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Data type not supported");
+                break;
+            }
+        }
 
         // Configure kernel window
         Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
@@ -494,7 +1053,9 @@
                                   AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
                                   output_access);
 
-        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+        Coordinates coord;
+        coord.set_num_dimensions(output->info()->num_dimensions());
+        output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
 
         INEKernel::configure(win);
     }
@@ -502,16 +1063,23 @@
     {
         switch(input0->info()->data_type())
         {
-            case DataType::F16:
+            case DataType::F32:
             {
                 num_elems_processed_per_iteration_x = 8;
                 break;
             }
-            case DataType::F32:
+            case DataType::QS8:
             {
-                num_elems_processed_per_iteration_x = 16;
+                num_elems_processed_per_iteration_x = 32;
                 break;
             }
+            case DataType::F16:
+            {
+#ifdef ARM_COMPUTE_ENABLE_FP16
+                num_elems_processed_per_iteration_x = 8;
+                break;
+#endif
+            }
             default:
             {
                 ARM_COMPUTE_ERROR("Data type not supported");
@@ -543,45 +1111,53 @@
     bool multiply_alpha = std::abs(1.0f - _alpha) > 0.00001f;
 
-    // Check if the output tensor is a vector and the data type is F32. If so,the kernel runs the vector-matrix multiplication
+    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
-    if((_output->info()->dimension(1) == 1) && (_input0->info()->data_type() == DataType::F32))
+    if((_output->info()->dimension(1) == 1))
     {
-        if(multiply_alpha)
+        switch(_input0->info()->data_type())
         {
-            vector_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha);
-        }
-        else
-        {
-            vector_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
+            case DataType::F32:
+            {
+                multiply_alpha ? vector_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha) :
+                vector_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
+                break;
+            }
+            case DataType::QS8:
+            {
+                multiply_alpha ? vector_matrix_multiply_qs8<true>(_input0, _input1, _output, window, _alpha) :
+                vector_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Data type not supported");
+                break;
+            }
         }
     }
     else
     {
         switch(_input0->info()->data_type())
         {
-            case DataType::F16:
-            {
-                if(multiply_alpha)
-                {
-                    matrix_matrix_multiply_f16<true>(_input0, _input1, _output, window, _alpha);
-                }
-                else
-                {
-                    matrix_matrix_multiply_f16<false>(_input0, _input1, _output, window, _alpha);
-                }
-                break;
-            }
             case DataType::F32:
             {
-                if(multiply_alpha)
-                {
-                    matrix_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha);
-                }
-                else
-                {
-                    matrix_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
-                }
+                multiply_alpha ? matrix_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha) :
+                matrix_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
                 break;
             }
+            case DataType::QS8:
+            {
+                multiply_alpha ? matrix_matrix_multiply_qs8<true>(_input0, _input1, _output, window, _alpha) :
+                matrix_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
+                break;
+            }
+            case DataType::F16:
+            {
+#ifdef ARM_COMPUTE_ENABLE_FP16
+                multiply_alpha ? matrix_matrix_multiply_f16<true>(_input0, _input1, _output, window, _alpha) :
+                matrix_matrix_multiply_f16<false>(_input0, _input1, _output, window, _alpha);
+                break;
+#endif
+            }
             default:
             {
                 ARM_COMPUTE_ERROR("Data type not supported");
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 1dbc07a..ccf5cb4 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
@@ -42,33 +43,22 @@
 
 void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    TensorShape  output_shape{ input->info()->tensor_shape() };
+    const size_t transpose_w = 16 / input->info()->element_size();
+    output_shape.set(0, input->info()->dimension(1) * transpose_w);
+    output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+
+    // Auto-initialize the output tensor if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(input->info()->dimension(0) / 8.0f)) && (input->info()->data_type() == DataType::F16));
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(input->info()->dimension(0) / 4.0f)) && (input->info()->data_type() == DataType::F32));
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(input->info()->dimension(0) / 4.0f)) && (input->info()->data_type() == DataType::U8));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
 
-    unsigned int num_elems_processed_per_iteration = 0;
-    float        scale_x                           = 1.f;
-
-    switch(input->info()->data_type())
-    {
-        case DataType::F32:
-        case DataType::U8:
-            num_elems_processed_per_iteration = 4;
-            scale_x                           = 4.f;
-            break;
-        case DataType::F16:
-#ifdef ARM_COMPUTE_ENABLE_FP16
-            num_elems_processed_per_iteration = 8;
-            scale_x                           = 8.f;
-            break;
-#endif
-        default:
-            ARM_COMPUTE_ERROR("Data type not supported");
-            break;
-    }
+    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+    const float        scale_x                           = num_elems_processed_per_iteration;
 
     _input  = input;
     _output = output;
@@ -99,11 +89,10 @@
      *         |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
      *         |a30 a31 a32 a33|
      *
-     * If the input data type is F32, the output matrix will have the following shape: [ height * 4, width / 4 ]
-     * If the input data type is F16, the output matrix will have the following shape: [ height * 8, width / 8 ]
+     * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
      */
 
-    /* Set window for output tensor. Set to 0 the X and Y dimensions in order to allow multi-threading implementation and future batched matrix multiplications. */
+    // Set window for the output tensor. Set the X and Y dimensions to 0 in order to allow a multi-threaded implementation and future batched matrix multiplications
     Window win_out(window);
     win_out.set(Window::DimX, Window::Dimension(0, 0, 0));
     win_out.set(Window::DimY, Window::Dimension(0, 0, 0));
@@ -111,56 +100,50 @@
     Iterator in(_input, window);
     Iterator out(_output, win_out);
 
-    switch(_input->info()->data_type())
+    switch(_input->info()->element_size())
     {
-        case DataType::F32:
+        case 1:
+        {
+            const size_t out_stride = _output->info()->strides_in_bytes()[1];
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                // Output address = base addr + (y * 16) + (x / 16 ) * stride
+                const uint8_t *in_ptr  = in.ptr();
+                uint8_t *const out_ptr = out.ptr() + (id.y() << 4) + (id.x() >> 4) * out_stride;
+                vst1q_u8(out_ptr, vld1q_u8(in_ptr));
+            },
+            in, out);
+            break;
+        }
+        case 2:
+        {
+            const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(int16_t);
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                // Output address = base addr + (y * 8) + (x / 8 ) * stride
+                const auto in_ptr  = reinterpret_cast<const uint16_t *>(in.ptr());
+                const auto out_ptr = reinterpret_cast<uint16_t *>(out.ptr()) + (id.y() << 3) + (id.x() >> 3) * out_stride;
+                vst1q_u16(out_ptr, vld1q_u16(in_ptr));
+            },
+            in, out);
+            break;
+        }
+        case 4:
         {
             const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(float);
-
             execute_window_loop(window, [&](const Coordinates & id)
             {
-                const auto        in_ptr = reinterpret_cast<const float *>(in.ptr());
-                const float32x4_t data   = vld1q_f32(in_ptr);
-                /* Output address = base addr + (y * 4) + (x / 4 ) * stride */
-                const auto out_ptr = reinterpret_cast<float *>(out.ptr()) + (id.y() << 2) + (id.x() >> 2) * out_stride;
-                vst1q_f32(out_ptr, data);
+                // Output address = base addr + (y * 4) + (x / 4 ) * stride
+                const auto in_ptr  = reinterpret_cast<const uint32_t *>(in.ptr());
+                const auto out_ptr = reinterpret_cast<uint32_t *>(out.ptr()) + (id.y() << 2) + (id.x() >> 2) * out_stride;
+                vst1q_u32(out_ptr, vld1q_u32(in_ptr));
             },
             in, out);
             break;
         }
-        case DataType::U8:
-        {
-            const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(uint8_t);
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
-                /* Output address = base addr + (y * 4) + (x / 4 ) * stride */
-                const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr()) + (id.y() << 2) + (id.x() >> 2) * out_stride;
-                std::copy_n(in_ptr, 4, out_ptr);
-            },
-            in, out);
-            break;
-        }
-
-        case DataType::F16:
-#ifdef ARM_COMPUTE_ENABLE_FP16
-            {
-                const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(float16_t);
-
-                execute_window_loop(window, [&](const Coordinates & id)
-                {
-                    const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
-                    // Output address = base addr + (y * 8) + (x / 8 ) * stride
-                    float16_t *out_ptr = reinterpret_cast<float16_t *>(out.ptr()) + (id.y() << 3) + (id.x() >> 3) * out_stride;
-                    vst1q_f16(out_ptr, vld1q_f16(in_ptr));
-                },
-                in, out);
-                break;
-            }
-#endif
         default:
         {
-            ARM_COMPUTE_ERROR("Data type not supported");
+            ARM_COMPUTE_ERROR("Element size not supported");
             break;
         }
     }
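
For reference, a minimal sketch (not part of the patch) of the output addressing used by the element-size based paths above, assuming the mapping implied by the stores: an input element (x, y) lands in output row x / W at column y * W + (x % W), with W = 16 / element_size.

#include <cstddef>

inline std::size_t transpose1xw_offset_sketch(std::size_t x, std::size_t y,
                                              std::size_t element_size,
                                              std::size_t out_row_stride_elems)
{
    const std::size_t w = 16 / element_size; // 16, 8 or 4 elements per vector store
    return (x / w) * out_row_stride_elems + y * w + (x % w);
}

// Example: for F32 (element_size = 4), element (x = 5, y = 2) lands at row 1, column 2 * 4 + 1 = 9.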
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
index 7bc4534..404ad8a 100644
--- a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
@@ -46,7 +46,6 @@
     static const int32x4_t   zero_s32     = vdupq_n_s32(0);
     static const int32x4_t   one_s32      = vdupq_n_s32(1);
     const int32x4_t          num_bins_s32 = vdupq_n_s32(num_bins);
-    const int32x4_t          max_hidx_s32 = vdupq_n_s32(num_bins - 1);
 
     memset(output_ptr, 0, sizeof(float) * num_bins);
 
@@ -70,14 +69,10 @@
             // Compute histogram index.
             int32x4_t hidx_s32 = vcvtq_s32_f32(phase_f32);
 
-            // Check if the histogram index is equal to num_bins. If so, replace the index with max_hidx
-            uint32x4_t mask = vceqq_s32(hidx_s32, num_bins_s32);
-            hidx_s32        = vbslq_s32(mask, max_hidx_s32, hidx_s32);
-
             // Compute magnitude weights (w0 and w1)
             const float32x4_t hidx_f32 = vcvtq_f32_s32(hidx_s32);
 
-            // w1 = phase_f32 - hidx_s32
+            // w1 = phase_f32 - hidx_f32
             const float32x4_t w1_f32 = vsubq_f32(phase_f32, hidx_f32);
 
             // w0 = 1.0 - w1
@@ -89,6 +84,10 @@
 
             // Weighted vote between 2 bins
 
+            // Check if the histogram index is equal to num_bins. If so, replace the index with 0
+            uint32x4_t mask = vceqq_s32(hidx_s32, num_bins_s32);
+            hidx_s32        = vbslq_s32(mask, zero_s32, hidx_s32);
+
             // Bin 0
             *(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w0_f32, 0);
             *(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w0_f32, 1);
@@ -108,11 +107,12 @@
             *(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w1_f32, 3);
         }
 
-        for(; xc < static_cast<int32_t>(cell_width); xc++)
+        for(; xc < static_cast<int32_t>(cell_width); ++xc)
         {
             const float phase_value = *(phase_row_ptr + xc + yc * phase_stride) * phase_scale + 0.5f;
             const float mag_value   = *(mag_row_ptr + xc + yc * mag_stride);
-            const float w1          = phase_value - std::floor(phase_value);
+
+            const float w1 = phase_value - std::floor(phase_value);
 
             // The quantised phase is the histogram index [0, num_bins - 1] - Round
             // Check limit of histogram index. If hidx == num_bins, hidx = 0
@@ -120,7 +120,7 @@
 
             // Weighted vote between 2 bins
             *(output_ptr + hidx) += mag_value * (1.0f - w1);
-            *(output_ptr + ((hidx + 1) % num_bins)) += mag_value * w1;
+            *(output_ptr + ((hidx + 1) % (num_bins))) += mag_value * w1;
         }
     }
 }
@@ -329,6 +329,7 @@
             vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
         }
 
+        // Compute the leftover bins
         for(; xc < static_cast<int32_t>(num_bins_block_x); xc++)
         {
             const float input_value = hist_ptr[xc];
@@ -416,7 +417,8 @@
             vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
         }
 
-        for(; xc < static_cast<int32_t>(num_bins_block_x); xc++)
+        // Compute the leftover bins
+        for(; xc < static_cast<int32_t>(num_bins_block_x); ++xc)
         {
             const float input_value = hist_ptr[xc];
 
@@ -431,9 +433,9 @@
     sum += vgetq_lane_f32(sum_f32, 2);
     sum += vgetq_lane_f32(sum_f32, 3);
 
-    float             scale           = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
-    float32x4_t       scale_f32       = vdupq_n_f32(scale);
-    const float32x4_t thres_l2hys_f32 = vdupq_n_f32(l2_hyst_threshold);
+    float             scale                 = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
+    float32x4_t       scale_f32             = vdupq_n_f32(scale);
+    const float32x4_t l2_hyst_threshold_f32 = vdupq_n_f32(l2_hyst_threshold);
 
     // Reset sum
     sum_f32 = vdupq_n_f32(0.0f);
@@ -460,10 +462,10 @@
         input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
 
         // Clip input_value if over _threshold_l2hys
-        input_value.val[0] = vminq_f32(input_value.val[0], thres_l2hys_f32);
-        input_value.val[1] = vminq_f32(input_value.val[1], thres_l2hys_f32);
-        input_value.val[2] = vminq_f32(input_value.val[2], thres_l2hys_f32);
-        input_value.val[3] = vminq_f32(input_value.val[3], thres_l2hys_f32);
+        input_value.val[0] = vminq_f32(input_value.val[0], l2_hyst_threshold_f32);
+        input_value.val[1] = vminq_f32(input_value.val[1], l2_hyst_threshold_f32);
+        input_value.val[2] = vminq_f32(input_value.val[2], l2_hyst_threshold_f32);
+        input_value.val[3] = vminq_f32(input_value.val[3], l2_hyst_threshold_f32);
 
         // Compute input_value^2
         sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
@@ -742,7 +744,6 @@
         case HOGNormType::L1_NORM:
             _func = &l1_norm;
             break;
-        case HOGNormType::L1SQRT_NORM:
         default:
             ARM_COMPUTE_ERROR_ON("Normalisation type not supported");
             break;
@@ -773,11 +774,11 @@
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    // Get number of element per block
-    const size_t num_elem_block = _output->info()->num_channels();
+    // Get number of bins per block
+    const size_t num_bins_per_block = _output->info()->num_channels();
 
     // Number of bins on the same row of the block
-    const int32_t num_bins_block_x = _num_cells_per_block.width * _num_bins;
+    const int32_t num_bins_per_block_x = _num_cells_per_block.width * _num_bins;
 
     const size_t input_stride = _input->info()->strides_in_bytes()[Window::DimY] / data_size_from_type(_input->info()->data_type());
 
@@ -795,7 +796,7 @@
         const auto out_row_ptr   = reinterpret_cast<float *>(out.ptr());
 
         // Execute normalization function
-        (*_func)(input_row_ptr, out_row_ptr, input_stride, _num_cells_per_block.height, num_bins_block_x, num_elem_block, _l2_hyst_threshold);
+        (*_func)(input_row_ptr, out_row_ptr, input_stride, _num_cells_per_block.height, num_bins_per_block_x, num_bins_per_block, _l2_hyst_threshold);
     },
     in, out);
 }
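
For reference, a minimal scalar sketch (not part of the patch) of the weighted vote between two neighbouring orientation bins performed in the cell histogram code above, assuming phase_value has already been scaled so that its integer part is the bin index.

#include <cmath>
#include <cstddef>

inline void weighted_vote_sketch(float phase_value, float magnitude, float *hist, std::size_t num_bins)
{
    const float       w1   = phase_value - std::floor(phase_value);
    // Wrap the quantised index back to 0 when it reaches num_bins.
    const std::size_t hidx = static_cast<std::size_t>(phase_value) % num_bins;

    hist[hidx] += magnitude * (1.0f - w1);
    hist[(hidx + 1) % num_bins] += magnitude * w1;
}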
diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
index c71d427..4af22bc 100644
--- a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
@@ -35,13 +35,13 @@
 
 NEHOGDetectorKernel::NEHOGDetectorKernel()
     : _input(nullptr), _detection_windows(), _hog_descriptor(nullptr), _bias(0.0f), _threshold(0.0f), _idx_class(0), _num_bins_per_descriptor_x(0), _num_blocks_per_descriptor_y(0), _block_stride_width(0),
-      _block_stride_height(0), _detection_window_width(0), _detection_window_height(0), _mutex()
+      _block_stride_height(0), _detection_window_width(0), _detection_window_height(0), _max_num_detection_windows(0), _mutex()
 {
 }
 
 void NEHOGDetectorKernel::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, uint16_t idx_class)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32);
     ARM_COMPUTE_ERROR_ON(hog == nullptr);
     ARM_COMPUTE_ERROR_ON(detection_windows == nullptr);
     ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0);
@@ -63,6 +63,9 @@
     _block_stride_height         = block_stride.height;
     _detection_window_width      = detection_window_size.width;
     _detection_window_height     = detection_window_size.height;
+    _max_num_detection_windows   = detection_windows->max_num_values();
+
+    ARM_COMPUTE_ERROR_ON((_num_bins_per_descriptor_x * _num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size());
 
     // Get the number of blocks along the x and y directions of the input tensor
     const ValidRegion &valid_region = input->info()->valid_region();
@@ -81,8 +84,8 @@
     win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x));
     win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y));
 
-    const unsigned int num_elems_read_per_iteration = _num_bins_per_descriptor_x;
-    const unsigned int num_rows_read_per_iteration  = _num_blocks_per_descriptor_y;
+    constexpr unsigned int num_elems_read_per_iteration = 1;
+    const unsigned int     num_rows_read_per_iteration  = _num_blocks_per_descriptor_y;
 
     update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration));
 
@@ -163,17 +166,20 @@
 
         if(score > _threshold)
         {
-            DetectionWindow win;
-            win.x         = (id.x() * _block_stride_width);
-            win.y         = (id.y() * _block_stride_height);
-            win.width     = _detection_window_width;
-            win.height    = _detection_window_height;
-            win.idx_class = _idx_class;
-            win.score     = score;
+            if(_detection_windows->num_values() < _max_num_detection_windows)
+            {
+                DetectionWindow win;
+                win.x         = (id.x() * _block_stride_width);
+                win.y         = (id.y() * _block_stride_height);
+                win.width     = _detection_window_width;
+                win.height    = _detection_window_height;
+                win.idx_class = _idx_class;
+                win.score     = score;
 
-            std::unique_lock<std::mutex> lock(_mutex);
-            _detection_windows->push_back(win);
-            lock.unlock();
+                std::unique_lock<std::mutex> lock(_mutex);
+                _detection_windows->push_back(win);
+                lock.unlock();
+            }
         }
     },
     in);
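
For reference, a minimal sketch (not part of the patch) of the bounded, mutex-guarded push pattern the detector loop above now follows, written against a plain std::vector instead of IDetectionWindowArray; names are hypothetical.

#include <cstddef>
#include <mutex>
#include <vector>

template <typename T>
bool bounded_push_sketch(std::vector<T> &out, const T &value, std::size_t max_values, std::mutex &m)
{
    std::lock_guard<std::mutex> lock(m);
    if(out.size() >= max_values)
    {
        return false; // silently drop new detections once the array is full
    }
    out.push_back(value);
    return true;
}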
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index 56797f0..585676b 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -51,39 +51,84 @@
 {
     static const float16x8_t zero = vdupq_n_f16(0.f);
 
-    /* Trace^2 */
+    // Trace^2
     float16x8_t trace2 = vaddq_f16(gx2, gy2);
     trace2             = vmulq_f16(trace2, trace2);
 
-    /* Det(A) */
+    // Det(A)
     float16x8_t det = vmulq_f16(gx2, gy2);
     det             = vfmsq_f16(det, gxgy, gxgy);
 
-    /* Det(A) - sensitivity * trace^2 */
+    // Det(A) - sensitivity * trace^2
     const float16x8_t mc = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace2);
 
-    /* mc > strength_thresh */
+    // mc > strength_thresh
     const uint16x8_t mask = vcgtq_f16(mc, vdupq_n_f16(strength_thresh));
 
     return vbslq_f16(mask, mc, zero);
 }
 
 template <size_t block_size>
-inline void harris_score_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
-                                           float norm_factor)
+inline void harris_score1xN_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
+                                              float norm_factor)
 {
     const float16x8_t norm_factor_fp16 = vdupq_n_f16(norm_factor);
 
-    /* Normalize */
+    // Normalize
     low_gx  = vmulq_f16(low_gx, norm_factor_fp16);
     low_gy  = vmulq_f16(low_gy, norm_factor_fp16);
     high_gx = vmulq_f16(high_gx, norm_factor_fp16);
     high_gy = vmulq_f16(high_gy, norm_factor_fp16);
 
-    for(size_t i = 0; i < block_size; ++i)
+    float16x8_t gx = vextq_f16(low_gx, high_gx, 0);
+    float16x8_t gy = vextq_f16(low_gy, high_gy, 0);
+
+    gx2  = vfmaq_f16(gx2, gx, gx);
+    gy2  = vfmaq_f16(gy2, gy, gy);
+    gxgy = vfmaq_f16(gxgy, gx, gy);
+
+    gx = vextq_f16(low_gx, high_gx, 1);
+    gy = vextq_f16(low_gy, high_gy, 1);
+
+    gx2  = vfmaq_f16(gx2, gx, gx);
+    gy2  = vfmaq_f16(gy2, gy, gy);
+    gxgy = vfmaq_f16(gxgy, gx, gy);
+
+    gx = vextq_f16(low_gx, high_gx, 2);
+    gy = vextq_f16(low_gy, high_gy, 2);
+
+    gx2  = vfmaq_f16(gx2, gx, gx);
+    gy2  = vfmaq_f16(gy2, gy, gy);
+    gxgy = vfmaq_f16(gxgy, gx, gy);
+
+    if(block_size > 3)
     {
-        const float16x8_t gx = vextq_f16(low_gx, high_gx, i);
-        const float16x8_t gy = vextq_f16(low_gy, high_gy, i);
+        gx = vextq_f16(low_gx, high_gx, 3);
+        gy = vextq_f16(low_gy, high_gy, 3);
+
+        gx2  = vfmaq_f16(gx2, gx, gx);
+        gy2  = vfmaq_f16(gy2, gy, gy);
+        gxgy = vfmaq_f16(gxgy, gx, gy);
+
+        gx = vextq_f16(low_gx, high_gx, 4);
+        gy = vextq_f16(low_gy, high_gy, 4);
+
+        gx2  = vfmaq_f16(gx2, gx, gx);
+        gy2  = vfmaq_f16(gy2, gy, gy);
+        gxgy = vfmaq_f16(gxgy, gx, gy);
+    }
+
+    if(block_size == 7)
+    {
+        gx = vextq_f16(low_gx, high_gx, 5);
+        gy = vextq_f16(low_gy, high_gy, 5);
+
+        gx2  = vfmaq_f16(gx2, gx, gx);
+        gy2  = vfmaq_f16(gy2, gy, gy);
+        gxgy = vfmaq_f16(gxgy, gx, gy);
+
+        gx = vextq_f16(low_gx, high_gx, 6);
+        gy = vextq_f16(low_gy, high_gy, 6);
 
         gx2  = vfmaq_f16(gx2, gx, gx);
         gy2  = vfmaq_f16(gy2, gy, gy);
@@ -101,7 +146,7 @@
     const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
     const auto     output   = static_cast<float *__restrict>(out_ptr);
 
-    /* Gx^2, Gy^2 and Gx*Gy */
+    // Gx^2, Gy^2 and Gx*Gy
     float16x8_t gx2  = vdupq_n_f16(0.0f);
     float16x8_t gy2  = vdupq_n_f16(0.0f);
     float16x8_t gxgy = vdupq_n_f16(0.0f);
@@ -112,19 +157,19 @@
         const float16x8_t high_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_1));
         const float16x8_t low_gy  = vcvtq_f16_s16(vld1q_s16(gy_ptr_0));
         const float16x8_t high_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_1));
-        harris_score_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
+        harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
 
-        /* Update gx and gy pointer */
+        // Update gx and gy pointer
         gx_ptr_0 += in_stride;
         gy_ptr_0 += in_stride;
         gx_ptr_1 += in_stride;
         gy_ptr_1 += in_stride;
     }
 
-    /* Calculate harris score */
+    // Calculate harris score
     const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
 
-    /* Store score */
+    // Store score
     vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
     vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
 }
@@ -143,7 +188,7 @@
     const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
     const auto     output   = static_cast<float *__restrict>(out_ptr);
 
-    /* Gx^2, Gy^2 and Gx*Gy */
+    // Gx^2, Gy^2 and Gx*Gy
     float16x8_t gx2  = zero;
     float16x8_t gy2  = zero;
     float16x8_t gxgy = zero;
@@ -158,9 +203,9 @@
                                                 vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
         const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
                                                  vget_low_f16(zero));
-        harris_score_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
+        harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
 
-        /* Update gx and gy pointer */
+        // Update gx and gy pointer
         gx_ptr_0 += in_stride;
         gy_ptr_0 += in_stride;
         gx_ptr_1 += in_stride;
@@ -169,10 +214,10 @@
         gy_ptr_2 += in_stride;
     }
 
-    /* Calculate harris score */
+    // Calculate harris score
     const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
 
-    /* Store score */
+    // Store score
     vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
     vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
 }
@@ -193,7 +238,7 @@
     const int32_t *gy_ptr_3 = gy_ptr_0 + 12;
     const auto     output   = static_cast<float *__restrict>(out_ptr);
 
-    /* Gx^2, Gy^2 and Gx*Gy */
+    // Gx^2, Gy^2 and Gx*Gy
     float16x8_t gx2  = zero;
     float16x8_t gy2  = zero;
     float16x8_t gxgy = zero;
@@ -208,9 +253,9 @@
                                                 vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
         const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
                                                  vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_3))));
-        harris_score_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
+        harris_score1xN_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
 
-        /* Update gx and gy pointer */
+        // Update gx and gy pointer
         gx_ptr_0 += in_stride;
         gy_ptr_0 += in_stride;
         gx_ptr_1 += in_stride;
@@ -219,10 +264,10 @@
         gy_ptr_2 += in_stride;
     }
 
-    /* Calculate harris score */
+    // Calculate harris score
     const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
 
-    /* Store score */
+    // Store score
     vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
     vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
 }
@@ -328,18 +373,18 @@
 {
 inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
 {
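+    // Mc = det(A) - sensitivity * trace(A)^2, with A = [[Gx^2, GxGy], [GxGy, Gy^2]];
+    // scores not exceeding strength_thresh are forced to zero.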
-    /* Trace^2 */
+    // Trace^2
     float32x4_t trace2 = vaddq_f32(gx2, gy2);
     trace2             = vmulq_f32(trace2, trace2);
 
-    /* Det(A) */
+    // Det(A)
     float32x4_t det = vmulq_f32(gx2, gy2);
     det             = vmlsq_f32(det, gxgy, gxgy);
 
-    /* Det(A) - sensitivity * trace^2 */
+    // Det(A) - sensitivity * trace^2
     const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2);
 
-    /* mc > strength_thresh */
+    // mc > strength_thresh
     const uint32x4_t mask = vcgtq_f32(mc, strength_thresh);
 
     return vbslq_f32(mask, mc, vdupq_n_f32(0.0f));
@@ -348,7 +393,7 @@
 inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
                                               float32x4_t norm_factor)
 {
-    /* Normalize */
+    // Normalize
     low_gx  = vmulq_f32(low_gx, norm_factor);
     low_gy  = vmulq_f32(low_gy, norm_factor);
     high_gx = vmulq_f32(high_gx, norm_factor);
@@ -361,17 +406,17 @@
     const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2);
     const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2);
 
-    /* Gx*Gx*/
+    // Gx*Gx
     gx2 = vmlaq_f32(gx2, l_gx, l_gx);
     gx2 = vmlaq_f32(gx2, m_gx, m_gx);
     gx2 = vmlaq_f32(gx2, r_gx, r_gx);
 
-    /* Gy*Gy*/
+    // Gy*Gy
     gy2 = vmlaq_f32(gy2, l_gy, l_gy);
     gy2 = vmlaq_f32(gy2, m_gy, m_gy);
     gy2 = vmlaq_f32(gy2, r_gy, r_gy);
 
-    /* Gx*Gy */
+    // Gx*Gy
     gxgy = vmlaq_f32(gxgy, l_gx, l_gy);
     gxgy = vmlaq_f32(gxgy, m_gx, m_gy);
     gxgy = vmlaq_f32(gxgy, r_gx, r_gy);
@@ -380,53 +425,53 @@
 inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
                                               float32x4_t norm_factor)
 {
-    /* Normalize */
+    // Normalize
     low_gx  = vmulq_f32(low_gx, norm_factor);
     low_gy  = vmulq_f32(low_gy, norm_factor);
     high_gx = vmulq_f32(high_gx, norm_factor);
     high_gy = vmulq_f32(high_gy, norm_factor);
 
-    /* L2 values  */
+    // L2 values
     float32x4_t gx = low_gx;
     float32x4_t gy = low_gy;
 
-    /* Accumulate */
+    // Accumulate
     gx2  = vmlaq_f32(gx2, gx, gx);
     gy2  = vmlaq_f32(gy2, gy, gy);
     gxgy = vmlaq_f32(gxgy, gx, gy);
 
-    /* L1 values  */
+    // L1 values
     gx = vextq_f32(low_gx, high_gx, 1);
     gy = vextq_f32(low_gy, high_gy, 1);
 
-    /* Accumulate */
+    // Accumulate
     gx2  = vmlaq_f32(gx2, gx, gx);
     gy2  = vmlaq_f32(gy2, gy, gy);
     gxgy = vmlaq_f32(gxgy, gx, gy);
 
-    /* M values  */
+    // M values
     gx = vextq_f32(low_gx, high_gx, 2);
     gy = vextq_f32(low_gy, high_gy, 2);
 
-    /* Accumulate */
+    // Accumulate
     gx2  = vmlaq_f32(gx2, gx, gx);
     gy2  = vmlaq_f32(gy2, gy, gy);
     gxgy = vmlaq_f32(gxgy, gx, gy);
 
-    /* R1 values  */
+    // R1 values
     gx = vextq_f32(low_gx, high_gx, 3);
     gy = vextq_f32(low_gy, high_gy, 3);
 
-    /* Accumulate */
+    // Accumulate
     gx2  = vmlaq_f32(gx2, gx, gx);
     gy2  = vmlaq_f32(gy2, gy, gy);
     gxgy = vmlaq_f32(gxgy, gx, gy);
 
-    /* R2 values  */
+    // R2 values
     gx = high_gx;
     gy = high_gy;
 
-    /* Accumulate */
+    // Accumulate
     gx2  = vmlaq_f32(gx2, gx, gx);
     gy2  = vmlaq_f32(gy2, gy, gy);
     gxgy = vmlaq_f32(gxgy, gx, gy);
@@ -435,81 +480,81 @@
 inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
                                               float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
 {
-    /* Normalize */
+    // Normalize
     low_gx  = vmulq_f32(low_gx, norm_factor);
     low_gy  = vmulq_f32(low_gy, norm_factor);
     high_gx = vmulq_f32(high_gx, norm_factor);
     high_gy = vmulq_f32(high_gy, norm_factor);
 
-    /* L3 values  */
+    // L3 values
     float32x4_t gx = low_gx;
     float32x4_t gy = low_gy;
 
-    /* Accumulate */
+    // Accumulate
     gx2  = vmlaq_f32(gx2, gx, gx);
     gy2  = vmlaq_f32(gy2, gy, gy);
     gxgy = vmlaq_f32(gxgy, gx, gy);
 
-    /* L2 values  */
+    // L2 values
     gx = vextq_f32(low_gx, high_gx, 1);
     gy = vextq_f32(low_gy, high_gy, 1);
 
-    /* Accumulate */
+    // Accumulate
     gx2  = vmlaq_f32(gx2, gx, gx);
     gy2  = vmlaq_f32(gy2, gy, gy);
     gxgy = vmlaq_f32(gxgy, gx, gy);
 
-    /* L1 values  */
+    // L1 values
     gx = vextq_f32(low_gx, high_gx, 2);
     gy = vextq_f32(low_gy, high_gy, 2);
 
-    /* Accumulate */
+    // Accumulate
     gx2  = vmlaq_f32(gx2, gx, gx);
     gy2  = vmlaq_f32(gy2, gy, gy);
     gxgy = vmlaq_f32(gxgy, gx, gy);
 
-    /* M values  */
+    // M values
     gx = vextq_f32(low_gx, high_gx, 3);
     gy = vextq_f32(low_gy, high_gy, 3);
 
-    /* Accumulate */
+    // Accumulate
     gx2  = vmlaq_f32(gx2, gx, gx);
     gy2  = vmlaq_f32(gy2, gy, gy);
     gxgy = vmlaq_f32(gxgy, gx, gy);
 
-    /* R1 values  */
+    // R1 values
     gx = high_gx;
     gy = high_gy;
 
-    /* Accumulate */
+    // Accumulate
     gx2  = vmlaq_f32(gx2, gx, gx);
     gy2  = vmlaq_f32(gy2, gy, gy);
     gxgy = vmlaq_f32(gxgy, gx, gy);
 
-    /* Change tmp_low and tmp_high for calculating R2 and R3 values */
+    // Change tmp_low and tmp_high for calculating R2 and R3 values
     low_gx  = high_gx;
     low_gy  = high_gy;
     high_gx = high_gx1;
     high_gy = high_gy1;
 
-    /* Normalize */
+    // Normalize
     high_gx = vmulq_f32(high_gx, norm_factor);
     high_gy = vmulq_f32(high_gy, norm_factor);
 
-    /* R2 values  */
+    // R2 values
     gx = vextq_f32(low_gx, high_gx, 1);
     gy = vextq_f32(low_gy, high_gy, 1);
 
-    /* Accumulate */
+    // Accumulate
     gx2  = vmlaq_f32(gx2, gx, gx);
     gy2  = vmlaq_f32(gy2, gy, gy);
     gxgy = vmlaq_f32(gxgy, gx, gy);
 
-    /* R3 values  */
+    // R3 values
     gx = vextq_f32(low_gx, high_gx, 2);
     gy = vextq_f32(low_gy, high_gy, 2);
 
-    /* Accumulate */
+    // Accumulate
     gx2  = vmlaq_f32(gx2, gx, gx);
     gy2  = vmlaq_f32(gy2, gy, gy);
     gxgy = vmlaq_f32(gxgy, gx, gy);
@@ -525,7 +570,7 @@
     const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
     const auto     output   = static_cast<float *__restrict>(output_ptr);
 
-    /* Gx^2, Gy^2 and Gx*Gy */
+    // Gx^2, Gy^2 and Gx*Gy
     float32x4x2_t gx2 =
     {
         {
@@ -548,7 +593,7 @@
         }
     };
 
-    /* Row0 */
+    // Row0
     int16x8x2_t tmp_gx =
     {
         {
@@ -579,7 +624,7 @@
     high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 
-    /* Row1 */
+    // Row1
     tmp_gx.val[0] = vld1q_s16(gx_ptr_0);
     tmp_gy.val[0] = vld1q_s16(gy_ptr_0);
     tmp_gx.val[1] = vld1q_s16(gx_ptr_1);
@@ -597,7 +642,7 @@
     high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 
-    /* Row2 */
+    // Row2
     tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride);
     tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride);
     tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride);
@@ -615,7 +660,7 @@
     high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 
-    /* Calculate harris score */
+    // Calculate harris score
     const float32x4x2_t mc =
     {
         {
@@ -624,7 +669,7 @@
         }
     };
 
-    /* Store score */
+    // Store score
     vst1q_f32(output + 0, mc.val[0]);
     vst1q_f32(output + 4, mc.val[1]);
 }
@@ -643,7 +688,7 @@
     float32x4_t    norm_factor     = vdupq_n_f32(in_norm_factor);
     float32x4_t    strength_thresh = vdupq_n_f32(in_strength_thresh);
 
-    /* Gx^2, Gy^2 and Gx*Gy */
+    // Gx^2, Gy^2 and Gx*Gy
     float32x4x2_t gx2 =
     {
         {
@@ -666,7 +711,7 @@
         }
     };
 
-    /* Row0 */
+    // Row0
     float32x4_t low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride));
     float32x4_t low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride));
     float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
@@ -679,7 +724,7 @@
     high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride));
     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 
-    /* Row1 */
+    // Row1
     low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
     low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
     high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
@@ -692,7 +737,7 @@
     high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 
-    /* Row2 */
+    // Row2
     low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride));
     low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride));
     high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
@@ -705,7 +750,7 @@
     high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride));
     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 
-    /* Calculate harris score */
+    // Calculate harris score
     const float32x4x2_t mc =
     {
         {
@@ -714,7 +759,7 @@
         }
     };
 
-    /* Store score */
+    // Store score
     vst1q_f32(output + 0, mc.val[0]);
     vst1q_f32(output + 4, mc.val[1]);
 }
@@ -728,7 +773,7 @@
     const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
     const auto     output   = static_cast<float *__restrict>(output_ptr);
 
-    /* Gx^2, Gy^2 and Gx*Gy */
+    // Gx^2, Gy^2 and Gx*Gy
     float32x4x2_t gx2 =
     {
         {
@@ -783,14 +828,14 @@
         high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
         harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 
-        /* Update gx and gy pointer */
+        // Update gx and gy pointer
         gx_ptr_0 += input_stride;
         gy_ptr_0 += input_stride;
         gx_ptr_1 += input_stride;
         gy_ptr_1 += input_stride;
     }
 
-    /* Calculate harris score */
+    // Calculate harris score
     const float32x4x2_t mc =
     {
         {
@@ -799,7 +844,7 @@
         }
     };
 
-    /* Store score */
+    // Store score
     vst1q_f32(output + 0, mc.val[0]);
     vst1q_f32(output + 4, mc.val[1]);
 }
@@ -816,7 +861,7 @@
     const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
     const auto     output   = static_cast<float *__restrict>(output_ptr);
 
-    /* Gx^2, Gy^2 and Gx*Gy */
+    // Gx^2, Gy^2 and Gx*Gy
     float32x4x2_t gx2 =
     {
         {
@@ -856,7 +901,7 @@
         const float32x4_t high_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
         harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_1, low_gy_1, high_gx_1, high_gy_1, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 
-        /* Update gx and gy pointer */
+        // Update gx and gy pointer
         gx_ptr_0 += input_stride;
         gy_ptr_0 += input_stride;
         gx_ptr_1 += input_stride;
@@ -865,7 +910,7 @@
         gy_ptr_2 += input_stride;
     }
 
-    /* Calculate harris score */
+    // Calculate harris score
     const float32x4x2_t mc =
     {
         {
@@ -874,7 +919,7 @@
         }
     };
 
-    /* Store score */
+    // Store score
     vst1q_f32(output + 0, mc.val[0]);
     vst1q_f32(output + 4, mc.val[1]);
 }
@@ -888,7 +933,7 @@
     const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
     const auto     output   = static_cast<float *__restrict>(output_ptr);
 
-    /* Gx^2, Gy^2 and Gx*Gy */
+    // Gx^2, Gy^2 and Gx*Gy
     float32x4_t gx2             = vdupq_n_f32(0.0f);
     float32x4_t gy2             = vdupq_n_f32(0.0f);
     float32x4_t gxgy            = vdupq_n_f32(0.0f);
@@ -911,17 +956,17 @@
         float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
         harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
 
-        /* Update gx and gy pointer */
+        // Update gx and gy pointer
         gx_ptr_0 += input_stride;
         gy_ptr_0 += input_stride;
         gx_ptr_1 += input_stride;
         gy_ptr_1 += input_stride;
     }
 
-    /* Calculate harris score */
+    // Calculate harris score
     const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
 
-    /* Store score */
+    // Store score
     vst1q_f32(output, mc);
 }
 
@@ -936,7 +981,7 @@
     const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
     const auto     output   = static_cast<float *__restrict>(output_ptr);
 
-    /* Gx^2, Gy^2 and Gx*Gy */
+    // Gx^2, Gy^2 and Gx*Gy
     float32x4_t gx2             = vdupq_n_f32(0.0f);
     float32x4_t gy2             = vdupq_n_f32(0.0f);
     float32x4_t gxgy            = vdupq_n_f32(0.0f);
@@ -954,7 +999,7 @@
         const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
         harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
 
-        /* Update gx and gy pointer */
+        // Update gx and gy pointer
         gx_ptr_0 += input_stride;
         gy_ptr_0 += input_stride;
         gx_ptr_1 += input_stride;
@@ -963,10 +1008,10 @@
         gy_ptr_2 += input_stride;
     }
 
-    /* Calculate harris score */
+    // Calculate harris score
     const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
 
-    /* Store score */
+    // Store score
     vst1q_f32(output, mc);
 }
 
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 4406eac..c7c23d5 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
 
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/FixedPoint.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -38,6 +39,112 @@
 
 using namespace arm_compute;
 
+namespace
+{
+template <typename T, bool has_pads>
+inline void linearize_volume(const uint8_t *const in_ptr,
+                             T                   *out_ptr,
+                             bool                 has_bias,
+                             int                  top_left_x,
+                             int                  top_left_y,
+                             int                  kernel_size,
+                             int                  kernel_depth,
+                             int                  input_w,
+                             int                  input_h,
+                             int                  input_stride_x,
+                             int                  input_stride_y,
+                             int                  input_stride_z,
+                             int                  fixed_point_position)
+{
+    const int kernel_size2 = kernel_size * kernel_size;
+    const int x_e          = top_left_x + kernel_size;
+    const int y_e          = top_left_y + kernel_size;
+
+    // Linearize volume
+    int d = 0;
+    // This loop linearizes a volume three slices at a time, which:
+    // 1) reduces the iterations of the outer loop over "d"
+    // 2) gives an optimized im2col for the first convolution layer, which usually has 3 IFMs
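+    // Illustration (hypothetical sizes): with kernel_size = 3 and kernel_depth = 3, each output
+    // row holds 3 * 9 = 27 values laid out as [slice0 patch | slice1 patch | slice2 patch],
+    // which is why out_ptr is advanced by 2 * kernel_size2 once the inner loops complete.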
+    for(; d <= (kernel_depth - 3); d += 3)
+    {
+        for(int y = top_left_y; y < y_e; ++y)
+        {
+            if((y < 0 || y >= input_h) && has_pads)
+            {
+                // All the values will be zeros
+                for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+                {
+                    *(out_ptr + 0 * kernel_size2) = 0;
+                    *(out_ptr + 1 * kernel_size2) = 0;
+                    *(out_ptr + 2 * kernel_size2) = 0;
+                }
+            }
+            else
+            {
+                for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+                {
+                    if((x < 0 || x >= input_w) && has_pads)
+                    {
+                        *(out_ptr + 0 * kernel_size2) = 0;
+                        *(out_ptr + 1 * kernel_size2) = 0;
+                        *(out_ptr + 2 * kernel_size2) = 0;
+                    }
+                    else
+                    {
+                        *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+                        *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+                        *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+                    }
+                }
+            }
+        }
+        out_ptr += 2 * kernel_size2;
+    }
+
+    // Left over
+    for(; d < kernel_depth; d++)
+    {
+        for(int y = top_left_y; y < y_e; ++y)
+        {
+            if((y < 0 || y >= input_h) && has_pads)
+            {
+                // All the values will be zeros
+                memset(out_ptr, 0, kernel_size * sizeof(T));
+                out_ptr += kernel_size;
+            }
+            else
+            {
+                for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+                {
+                    if((x < 0 || x >= input_w) && has_pads)
+                    {
+                        *out_ptr = 0;
+                    }
+                    else
+                    {
+                        *out_ptr = *(reinterpret_cast<const T *>(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
+                    }
+                }
+            }
+        }
+    }
+
+    // Append 1 if the convolution layer has biases
+    if(has_bias)
+    {
+        if(std::is_same<T, arm_compute::qint8_t>::value)
+        {
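+            // For QS8 the bias value 1 must be encoded in the tensor's fixed point format,
+            // e.g. (illustrative) 1.0f becomes 0x40 when fixed_point_position == 6.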
+            *out_ptr = scvt_qs8_f32(1.0f, fixed_point_position);
+        }
+        else
+        {
+            *out_ptr = static_cast<T>(1);
+        }
+    }
+}
+} // namespace
+
+template <typename T, bool has_pads>
 void NEIm2ColKernel::run_generic(const Window &window)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -84,36 +191,27 @@
 
         // Get pointers
         const uint8_t *const input_ptr  = in.ptr();
-        auto                 output_ptr = reinterpret_cast<float *>(out.ptr());
+        auto                 output_ptr = reinterpret_cast<T *>(out.ptr());
 
         // Linearize volume
-        for(int d = 0; d < kernel_depth; ++d)
-        {
-            for(int y = top_left_y, y_e = top_left_y + static_cast<int>(_kernel_size); y < y_e; ++y)
-            {
-                for(int x = top_left_x, x_e = top_left_x + static_cast<int>(_kernel_size); x < x_e; ++x, ++output_ptr)
-                {
-                    if(x < 0 || x >= input_w || y < 0 || y >= input_h)
-                    {
-                        *output_ptr = 0.f;
-                    }
-                    else
-                    {
-                        *output_ptr = *(reinterpret_cast<const float *>(input_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
-                    }
-                }
-            }
-        }
-
-        // Add bias
-        if(_has_bias)
-        {
-            *output_ptr = 1;
-        }
+        linearize_volume<T, has_pads>(input_ptr,
+                                      output_ptr,
+                                      _has_bias,
+                                      top_left_x,
+                                      top_left_y,
+                                      static_cast<int>(_kernel_size),
+                                      kernel_depth,
+                                      input_w,
+                                      input_h,
+                                      input_stride_x,
+                                      input_stride_y,
+                                      input_stride_z,
+                                      _input->info()->fixed_point_position());
     },
     in, out);
 }
 
+template <typename T>
 void NEIm2ColKernel::run_reduced(const Window &window)
 {
     const size_t in_width   = _input->info()->dimension(0);
@@ -148,7 +246,14 @@
         // Add bias
         if(_has_bias)
         {
-            *(reinterpret_cast<float *>(out_ptr) + out_width - 1) = 1.0f;
+            if(std::is_same<T, arm_compute::qint8_t>::value)
+            {
+                *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = scvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
+            }
+            else
+            {
+                *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = static_cast<T>(1);
+            }
         }
     }
     while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
@@ -161,8 +266,9 @@
 
 void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
     _input          = input;
     _output         = output;
@@ -185,11 +291,33 @@
 
     if(run_img2col_reduced)
     {
-        _func = &NEIm2ColKernel::run_reduced;
+        switch(_input->info()->data_type())
+        {
+            case DataType::F32:
+                _func = &NEIm2ColKernel::run_reduced<float>;
+                break;
+            case DataType::QS8:
+                _func = &NEIm2ColKernel::run_reduced<qint8_t>;
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Data type not supported");
+                break;
+        }
     }
     else
     {
-        _func = &NEIm2ColKernel::run_generic;
+        switch(_input->info()->data_type())
+        {
+            case DataType::F32:
+                _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<float, false> : &NEIm2ColKernel::run_generic<float, true>;
+                break;
+            case DataType::QS8:
+                _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<qint8_t, false> : &NEIm2ColKernel::run_generic<qint8_t, true>;
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Data type not supported");
+                break;
+        }
         window.set(Window::DimX, Window::Dimension(0, _convolved_dims.first, 1));
         window.set(Window::DimY, Window::Dimension(0, _convolved_dims.second, 1));
         window.set(Window::DimZ, Window::Dimension(0, 1, 1));
diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.cpp b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
index 8651c83..3b09a1b 100644
--- a/src/core/NEON/kernels/NEIntegralImageKernel.cpp
+++ b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
@@ -40,9 +40,25 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
 
+    _input  = input;
+    _output = output;
+
     constexpr unsigned int num_elems_processed_per_iteration = 16;
 
-    INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    // The kernel effectively reads 17 values starting at offset -1, as it loads 16 values
+    // starting at -1 and another 16 starting at 0
+    AccessWindowRectangle  output_read_access(output->info(), -1, -1, num_elems_processed_per_iteration + 1, 1);
+    AccessWindowHorizontal output_write_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+                              output_read_access, output_write_access);
+
+    output_write_access.set_valid_region(win, input->info()->valid_region());
+
+    IKernel::configure(win);
 }
 
 BorderSize NEIntegralImageKernel::border_size() const
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000..ab84efb
--- /dev/null
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window)
+{
+    const auto width_matrix_b  = static_cast<int>(output->info()->dimension(0));
+    const auto in_b_stride     = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+    const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+
+    // The implementation computes 16 elements per iteration
+    const int window_start_x = 16 * window.thread_id();
+    const int window_step_x  = 16 * window.num_threads();
+    // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+    const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
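+    // Output columns are split round-robin across threads: thread t starts at column 16 * t
+    // and then advances by 16 * num_threads at each step of the window.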
+
+    Window win_out(window);
+    win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+
+    Window win_a(window);
+    win_a.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator ina(input0, win_a);
+    Iterator out(output, win_out);
+
+    execute_window_loop(win_out, [&](const Coordinates & id)
+    {
+        if(id.x() > width_matrix_b)
+        {
+            return;
+        }
+
+        float32x4_t acc0 = vdupq_n_f32(0.f);
+        float32x4_t acc1 = vdupq_n_f32(0.f);
+        float32x4_t acc2 = vdupq_n_f32(0.f);
+        float32x4_t acc3 = vdupq_n_f32(0.f);
+
+        auto vec_a    = reinterpret_cast<const float *>(ina.ptr());
+        auto matrix_b = reinterpret_cast<const float *>(input1->ptr_to_element(Coordinates(id[0], 0, id[1])));
+
+#if __arm__
+        asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+        asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+        asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+#endif
+
+        const float *vec_a_end_addr = vec_a + num_elems_vec_a;
+
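+        // Main loop: each iteration consumes 4 elements of vec_a and 4 rows of matrix_b,
+        // accumulating into the 16 output columns held in acc0..acc3.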
+        for(; vec_a <= (vec_a_end_addr - 4);)
+        {
+            float32x2_t a0l = vld1_f32(vec_a);
+
+            float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+            float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+            float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+            float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+            float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+            float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+            float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+            float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+#if __arm__
+            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+#endif
+
+            acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+            acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+            acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+            acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+            acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+            acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+            acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+            acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+            vec_a += 2;
+            matrix_b += 2 * in_b_stride;
+
+            a0l = vld1_f32(vec_a);
+
+            b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+            b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+            b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+            b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+            b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+            b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+            b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+            b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+            acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+            acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+            acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+            acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+            acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+            acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+            acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+            acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+            vec_a += 2;
+            matrix_b += 2 * in_b_stride;
+        }
+
+        for(; vec_a < vec_a_end_addr;)
+        {
+            const float a0 = *vec_a;
+
+            const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+            const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+            const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+            const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+            acc0 = vmlaq_n_f32(acc0, b00, a0);
+            acc1 = vmlaq_n_f32(acc1, b01, a0);
+            acc2 = vmlaq_n_f32(acc2, b02, a0);
+            acc3 = vmlaq_n_f32(acc3, b03, a0);
+
+            vec_a += 1;
+            matrix_b += in_b_stride;
+        }
+
+        const auto vec_out = reinterpret_cast<float *>(out.ptr());
+
+        vst1q_f32(vec_out + 0, acc0);
+        vst1q_f32(vec_out + 4, acc1);
+        vst1q_f32(vec_out + 8, acc2);
+        vst1q_f32(vec_out + 12, acc3);
+    },
+    ina, out);
+}
+} // namespace
+
+NELocallyConnectedMatrixMultiplyKernel::NELocallyConnectedMatrixMultiplyKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void NELocallyConnectedMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+
+    _input0 = input0;
+    _input1 = input1;
+    _output = output;
+
+    unsigned int num_elems_processed_per_iteration_x = 16;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x);
+
+    update_window_and_padding(win,
+                              AccessWindowHorizontal(input0->info(), 0, num_elems_processed_per_iteration_x),
+                              AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
+                              output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+void NELocallyConnectedMatrixMultiplyKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    vector_matrix_multiply_f32(_input0, _input1, _output, window);
+}
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
index 576bf5c..b188614 100644
--- a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
@@ -36,8 +36,8 @@
 #include <climits>
 #include <cstddef>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 NEMinMaxKernel::NEMinMaxKernel()
     : _func(), _input(nullptr), _min(), _max(), _min_init(), _max_init(), _mtx()
 {
@@ -190,7 +190,6 @@
     return false;
 }
 
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
 template <unsigned int...>
 struct index_seq
 {
@@ -210,10 +209,7 @@
 {
     using type = index_seq<S...>;
 };
-#endif /* DOXYGEN_SKIP_THIS */
 
-namespace arm_compute
-{
 template <class T, unsigned int... N>
 struct NEMinMaxLocationKernel::create_func_table<T, index_seq<N...>>
 {
@@ -225,7 +221,6 @@
 {
     &NEMinMaxLocationKernel::minmax_loc<T, bool(N & 8), bool(N & 4), bool(N & 2), bool(N & 1)>...
 };
-} // namespace arm_compute
 
 void NEMinMaxLocationKernel::configure(const IImage *input, int32_t *min, int32_t *max,
                                        ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc,
@@ -363,3 +358,4 @@
         }
     }
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
index 4ca5dca..03d1409 100644
--- a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
@@ -36,8 +36,8 @@
 #include <tuple>
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 const uint8x16_t zero_u8 = vdupq_n_u8(0);
@@ -419,9 +419,6 @@
     }
 }
 
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
-namespace arm_compute
-{
 template <>
 void NENonLinearFilterKernel::median_filter_box<3, 3>(const Window &win)
 {
@@ -507,8 +504,6 @@
     },
     input, output);
 }
-} // namespace arm_compute
-#endif
 
 template <int mask_w, int mask_h>
 void NENonLinearFilterKernel::min_filter_box(const Window &win)
@@ -588,9 +583,6 @@
     input, output);
 }
 
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
-namespace arm_compute
-{
 template <>
 void NENonLinearFilterKernel::median_filter_cross<3, 3>(const Window &win)
 {
@@ -656,8 +648,6 @@
     },
     input, output);
 }
-} // namespace arm_compute
-#endif
 
 template <int mask_w, int mask_h>
 void NENonLinearFilterKernel::min_filter_cross(const Window &win)
@@ -751,9 +741,6 @@
     input, output);
 }
 
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
-namespace arm_compute
-{
 template <>
 void NENonLinearFilterKernel::median_filter_disk<5, 5>(const Window &win)
 {
@@ -878,8 +865,6 @@
     },
     input, output);
 }
-} // namespace arm_compute
-#endif
 
 template <int mask_w, int mask_h>
 void NENonLinearFilterKernel::non_linear_filter_generic(const Window &win)
@@ -1021,3 +1006,4 @@
             break;
     }
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 90ce0e5..a971dc8 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -23,7 +23,9 @@
  */
 #include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
 
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
@@ -33,7 +35,7 @@
 using namespace arm_compute;
 
 NENormalizationLayerKernel::NENormalizationLayerKernel()
-    : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr), _norm_info(NormType::IN_MAP), _border_size()
+    : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr), _norm_info(NormType::IN_MAP_1D), _border_size()
 {
 }
 
@@ -44,28 +46,51 @@
 
 void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, input_squared);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared, output);
     ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
+    ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
+    ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
+    ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
 
-    const unsigned int border_width = (norm_info.type() == NormType::IN_MAP) ? 3 : 0;
+    const unsigned int border_width = (norm_info.type() == NormType::CROSS_MAP) ? 0 : std::min(norm_info.norm_size() / 2, 3U);
 
     _input         = input;
     _input_squared = input_squared;
     _output        = output;
     _norm_info     = norm_info;
-    _func          = (norm_info.type() == NormType::IN_MAP) ? &NENormalizationLayerKernel::normalize<0> : &NENormalizationLayerKernel::normalize<2>;
     _border_size   = BorderSize(0, border_width);
 
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
-    const unsigned int     num_elems_read_per_iteration      = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+    const bool is_dt_f32 = _input->info()->data_type() == DataType::F32;
+
+    switch(norm_info.type())
+    {
+        case NormType::IN_MAP_1D:
+            _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, false> : &NENormalizationLayerKernel::normalize_fixed_point<0, false>;
+            break;
+        case NormType::IN_MAP_2D:
+            // Normalize over X and Y
+            _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, true> : &NENormalizationLayerKernel::normalize_fixed_point<0, true>;
+            break;
+        case NormType::CROSS_MAP:
+            _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<2, false> : &NENormalizationLayerKernel::normalize_fixed_point<2, false>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+    }
+
+    const unsigned int num_elems_processed_per_iteration = (is_dt_f32) ? 4 : 16;
+    const unsigned int num_elems_read_per_iteration      = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+    const unsigned int num_rows                          = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
 
     // Configure window
     Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
 
-    AccessWindowHorizontal input_access(input->info(), -_border_size.left, num_elems_read_per_iteration);
-    AccessWindowHorizontal input_squared_access(input_squared->info(), -_border_size.left, num_elems_read_per_iteration);
+    AccessWindowRectangle  input_access(input->info(), -_border_size.left, 0, num_elems_read_per_iteration, num_rows);
+    AccessWindowRectangle  input_squared_access(input_squared->info(), -_border_size.left, 0, num_elems_read_per_iteration, num_rows);
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
 
     update_window_and_padding(win, input_access, input_squared_access, output_access);
@@ -75,19 +100,22 @@
     INEKernel::configure(win);
 }
 
-template <unsigned int dim>
+template <unsigned int dim, bool do_2D_norm>
 void NENormalizationLayerKernel::normalize(const Window &window)
 {
     Iterator input(_input, window);
     Iterator input_squared(_input_squared, window);
     Iterator output(_output, window);
 
+    const int dim_y                = 1;
     const int radius               = _norm_info.norm_size() / 2;
     const int total_size           = _input->info()->dimension(dim) - 1;
     const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
-    // We account padding when we normalize across X
-    const int min_left  = (dim == 0) ? -static_cast<int>(border_size().left) : 0;
-    const int max_right = (dim == 0) ? total_size + border_size().left : total_size;
+    // We account for padding across X only and iterate over rows
+    const int min_left   = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
+    const int max_right  = (dim == 2) ? total_size : total_size + border_size().left;
+    const int min_top    = 0;
+    const int max_bottom = _input->info()->dimension(dim_y) - 1;
 
     const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
     const float32x4_t beta_vec  = vdupq_n_f32(_norm_info.beta());
@@ -96,25 +124,89 @@
     execute_window_loop(window, [&](const Coordinates & id)
     {
         // Get range to normalize
+        const int current_row   = do_2D_norm ? id[dim_y] : 0;
         const int current_slice = id[dim];
+        const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+        const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
         const int first_slice   = std::max(current_slice - radius, min_left);
         const int last_slice    = std::min(current_slice + radius, max_right);
 
-        // Accumulate cross map values
+        // Accumulate 2D In-Map values
         float32x4_t accu = vdupq_n_f32(0.f);
-        for(int i = first_slice; i <= last_slice; ++i)
+        for(int j = first_row; j <= last_row; j++)
         {
-            accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<float *>(input_squared.ptr() + (i - current_slice) * input_squared_stride)));
+            // Compute row displacement
+            const int            row               = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+            const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+            for(int i = first_slice; i <= last_slice; ++i)
+            {
+                accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(input_squared_ptr + i * input_squared_stride)));
+            }
         }
 
         // Normalize
         const float32x4_t normalized       = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
-        const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), vinvq_f32(normalized));
+        const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
         vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
     },
     input, input_squared, output);
 }
 
+template <unsigned int dim, bool do_2D_norm>
+void NENormalizationLayerKernel::normalize_fixed_point(const Window &window)
+{
+    Iterator input(_input, window);
+    Iterator input_squared(_input_squared, window);
+    Iterator output(_output, window);
+
+    const int dim_y                = 1;
+    const int radius               = _norm_info.norm_size() / 2;
+    const int total_size           = _input->info()->dimension(dim) - 1;
+    const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
+    // We account for padding across X only and iterate over rows
+    const int min_left   = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
+    const int max_right  = (dim == 2) ? total_size : total_size + border_size().left;
+    const int min_top    = 0;
+    const int max_bottom = _input->info()->dimension(dim_y) - 1;
+
+    const int fixed_point_position = _input->info()->fixed_point_position();
+
+    const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position);
+    const qint8x16_t beta_vec  = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position);
+    const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Get range to normalize
+        const int current_row   = do_2D_norm ? id[dim_y] : 0;
+        const int current_slice = id[dim];
+        const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+        const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+        const int first_slice   = std::max(current_slice - radius, min_left);
+        const int last_slice    = std::min(current_slice + radius, max_right);
+
+        // Accumulate 2D In-Map values
+        qint8x16_t accu = vdupq_n_qs8(0);
+        for(int j = first_row; j <= last_row; ++j)
+        {
+            // Compute row displacement
+            const int            row               = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+            const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+            for(int i = first_slice; i <= last_slice; ++i)
+            {
+                accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast<const qint8_t *>(input_squared_ptr + i * input_squared_stride)));
+            }
+        }
+
+        // Normalize
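+        // normalized = (kappa + coeff * accu)^beta and output = input / normalized,
+        // all evaluated in QS8 fixed-point arithmetic.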
+        const qint8x16_t accu_scale       = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position);
+        const qint8x16_t normalized       = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position);
+        const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), normalized, fixed_point_position);
+        vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), normalized_pixel);
+    },
+    input, input_squared, output);
+}
+
 void NENormalizationLayerKernel::run(const Window &window)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index 6b7510f..aa8c7a1 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
@@ -61,7 +62,6 @@
 {
     // Scale
     const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
-
     // Round to nearest (round half up)
     // Add +0.5 for all values
     // Afterwards vcvt rounds toward zero
@@ -125,6 +125,25 @@
 }
 
 template <bool is_scale255, bool is_sat>
+void mul_QS8_QS8_QS8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
+{
+    // n is the exponent of the scaling factor, that is scale = 1/2^n. Currently, we only support scaling factor equal to 1 => n = 0.
+    ARM_COMPUTE_ERROR_ON_MSG(n != 0, "Scaling factors other than 1 are not supported for 8-bit fixed-point pixel-wise multiplication");
+    ARM_COMPUTE_UNUSED(n);
+
+    const auto input1 = static_cast<const qint8_t *__restrict>(input1_ptr);
+    const auto input2 = static_cast<const qint8_t *__restrict>(input2_ptr);
+    const auto output = static_cast<qint8_t *__restrict>(output_ptr);
+
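+    // Each call processes 16 QS8 pixels; is_sat selects the saturating fixed-point multiply.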
+    const qint8x16_t ta1 = vld1q_qs8(input1);
+    const qint8x16_t ta2 = vld1q_qs8(input2);
+
+    qint8x16_t res = (is_sat) ? vqmulq_qs8(ta1, ta2, fixed_point_position) : vmulq_qs8(ta1, ta2, fixed_point_position);
+
+    vst1q_s8(output, res);
+}
+
+template <bool is_scale255, bool is_sat>
 inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n)
 {
     int32x4_t       tmp1_high = vmovl_s16(vget_high_s16(input1));
@@ -142,17 +161,28 @@
     }
     else
     {
+        // Right shift amount
         const int32x4_t vn = vdupq_n_s32(-n);
-
+        // Left shift amount
+        const int32x4_t vnl = vdupq_n_s32(n);
+        // Calculate conversion bit
+        const uint32x4_t tmp1_high_u  = vreinterpretq_u32_s32(tmp1_high);
+        const uint32x4_t tmp1_low_u   = vreinterpretq_u32_s32(tmp1_low);
+        const uint32x4_t sign_high    = vshrq_n_u32(tmp1_high_u, 31);
+        const uint32x4_t sign_low     = vshrq_n_u32(tmp1_low_u, 31);
+        const int32x4_t  sign_high_s  = vreinterpretq_s32_u32(sign_high);
+        const int32x4_t  sign_low_s   = vreinterpretq_s32_u32(sign_low);
+        const int32x4_t  convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
+        const int32x4_t  convert_low  = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
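+        // For negative inputs the convert term equals 2^n - 1, so the right shift below
+        // rounds the product toward zero instead of toward negative infinity.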
         if(is_sat)
         {
-            tmp1_high = vqshlq_s32(tmp1_high, vn);
-            tmp1_low  = vqshlq_s32(tmp1_low, vn);
+            tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
+            tmp1_low  = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
         }
         else
         {
-            tmp1_high = vshlq_s32(tmp1_high, vn);
-            tmp1_low  = vshlq_s32(tmp1_low, vn);
+            tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
+            tmp1_low  = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
         }
     }
 
@@ -297,17 +327,23 @@
 } // namespace
 
 NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
-    : _func_float(nullptr), _func_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
+    : _func_float(nullptr), _func_int(nullptr), _func_q_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
 {
 }
 
 void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
                              "Output can only be U8 if both inputs are U8");
+    if(output->info()->data_type() == DataType::QS8 || input1->info()->data_type() == DataType::QS8 || input2->info()->data_type() == DataType::QS8)
+    {
+        // All data types must be QS8
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input1, input2, output);
+    }
 
     _input1         = input1;
     _input2         = input2;
@@ -315,13 +351,14 @@
     _scale          = scale;
     _scale_exponent = 0;
     _func_int       = nullptr;
+    _func_q_int     = nullptr;
     _func_float     = nullptr;
 
     bool is_scale_255 = false;
     // Check and validate scaling factor
     if(std::abs(scale - scale255_constant) < 0.00001f)
     {
-        ARM_COMPUTE_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
+        ARM_COMPUTE_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
         ARM_COMPUTE_UNUSED(rounding_policy);
 
         is_scale_255 = true;
@@ -409,6 +446,17 @@
             _func_int = is_sat ? &mul_U8_U8_S16_n<false, true> : &mul_U8_U8_S16_n<false, false>;
         }
     }
+    else if(DataType::QS8 == dt_input1 && DataType::QS8 == dt_input2 && DataType::QS8 == dt_output)
+    {
+        if(is_scale_255)
+        {
+            _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<true, true> : &mul_QS8_QS8_QS8_n<true, false>;
+        }
+        else
+        {
+            _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<false, true> : &mul_QS8_QS8_QS8_n<false, false>;
+        }
+    }
     else if(DataType::F32 == dt_input1 && DataType::F32 == dt_input2 && DataType::F32 == dt_output)
     {
         _func_float = &mul_F32_F32_F32_n<false, false>;
@@ -455,6 +503,15 @@
         },
         input1, input2, output);
     }
+    else if(_func_q_int != nullptr)
+    {
+        int fixed_point_position = _input1->info()->fixed_point_position();
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            (*_func_q_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent, fixed_point_position);
+        },
+        input1, input2, output);
+    }
     else
     {
         ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
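run() dispatches through whichever of _func_int, _func_q_int or _func_float was selected in configure(); the new QS8 path additionally forwards the tensor's fixed point position. A standalone sketch of that configure-time selection / run-time dispatch pattern (all names here are illustrative, not the library's):

#include <cstdint>
#include <iostream>

struct ToyKernel
{
    using QFunc = void (*)(const int8_t *, const int8_t *, int8_t *, int fixed_point_position);

    void configure(bool saturate)
    {
        // Pick the specialisation once, so run() stays branch-free.
        _func = saturate ? &mul_sat : &mul_wrap;
    }

    void run(const int8_t *a, const int8_t *b, int8_t *out, int fp) const
    {
        (*_func)(a, b, out, fp);
    }

private:
    static void mul_sat(const int8_t *a, const int8_t *b, int8_t *out, int fp)
    {
        const int v = (int(*a) * int(*b)) >> fp;
        *out        = static_cast<int8_t>(v > 127 ? 127 : (v < -128 ? -128 : v));
    }

    static void mul_wrap(const int8_t *a, const int8_t *b, int8_t *out, int fp)
    {
        *out = static_cast<int8_t>((int(*a) * int(*b)) >> fp);
    }

    QFunc _func{ nullptr };
};

int main()
{
    ToyKernel k;
    k.configure(true);
    int8_t a = 64, b = 64, out = 0; // 1.0 * 1.0 with fixed point position 6
    k.run(&a, &b, &out, 6);
    std::cout << int(out) << "\n"; // prints 64, i.e. 1.0 at fixed point position 6
    return 0;
}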
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 43d8743..30b67b6 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -25,8 +25,10 @@
 
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/FixedPoint.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
@@ -51,10 +53,23 @@
     int end_y   = std::min(start_y + pool_size, upper_bound_h);
     return 1.f / ((end_y - start_y) * (end_x - start_x));
 }
+
+inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
+                                      int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
+{
+    static std::array<qint8_t, 10> scale_values_q8 =
+    { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
+    const int start_x = id.x() * stride_x - pad_x;
+    const int start_y = id.y() * stride_y - pad_y;
+    const int end_x   = std::min(start_x + pool_size, upper_bound_w);
+    const int end_y   = std::min(start_y + pool_size, upper_bound_h);
+    const int val     = ((end_y - start_y) * (end_x - start_x));
+    return scale_values_q8[val] >> (7 - fixed_point_position);
+}
 } // namespace
 
 NEPoolingLayerKernel::NEPoolingLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _border_size(0)
+    : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
 {
 }
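calculate_avg_scale_q8() appears to store floor(128 / n) for region sizes n = 2..9, i.e. the reciprocal of the pooling region size in Q0.7, and then shifts it into the tensor's own Q format (hence the fixed_point_position <= 6 restriction checked in configure()). A small standalone check of that reading:

#include <array>
#include <cassert>
#include <cstdint>

int main()
{
    const std::array<int8_t, 10> scale_values_q8 =
    { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };

    // Entries 2..9 look like 1/n encoded in Q0.7 (128 == 1.0), truncated.
    for(int n = 2; n <= 9; ++n)
    {
        assert(scale_values_q8[n] == 128 / n);
    }

    // Rescaling into the tensor's format: e.g. fixed point position 5 turns 1/4 (0x20 in Q0.7) into 0x08.
    const int fixed_point_position = 5;
    assert((scale_values_q8[4] >> (7 - fixed_point_position)) == 0x08);
    return 0;
}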
 
@@ -78,11 +93,14 @@
     std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
 
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size);
     ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
+    ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_type == PoolingType::AVG && input->info()->fixed_point_position() > 6);
+    ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_stride_x > 2);
 
     // Check output dimensions
     std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
@@ -92,11 +110,33 @@
     ARM_COMPUTE_UNUSED(pooled_h);
     ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
 
-    const int num_elems_read_per_iteration = (pool_size == 2) ? 2 : 4; // We use vload4 for pooling3
-    const int input_width                  = input->info()->dimension(0);
-    const int input_height                 = input->info()->dimension(1);
-    const int upper_bound_w                = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
-    const int upper_bound_h                = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+    unsigned int num_elems_read_per_iteration      = 0;
+    unsigned int num_elems_processed_per_iteration = 0;
+    unsigned int num_elems_horizontal_window       = 0;
+
+    // Select element size
+    switch(input->info()->data_type())
+    {
+        case DataType::QS8:
+            num_elems_read_per_iteration      = 16;
+            num_elems_processed_per_iteration = (pool_size == 2) ? 8 : 7;
+            num_elems_horizontal_window       = 8;
+            break;
+        case DataType::F32:
+            num_elems_read_per_iteration      = (pool_size == 2) ? 2 : 4; // We use vload4 for pooling3
+            num_elems_processed_per_iteration = 1;
+            num_elems_horizontal_window       = 1;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Element size not supported");
+            break;
+    }
+
+    _num_elems_processed_per_iteration = num_elems_processed_per_iteration;
+    const int input_width              = input->info()->dimension(0);
+    const int input_height             = input->info()->dimension(1);
+    const int upper_bound_w            = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
+    const int upper_bound_h            = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
 
     // Set instance variables
     _input              = input;
@@ -110,10 +150,24 @@
     switch(pool_size)
     {
         case 2:
-            _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2<PoolingType::MAX>;
+            if(input->info()->data_type() == DataType::QS8)
+            {
+                _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+            }
+            else if(input->info()->data_type() == DataType::F32)
+            {
+                _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
+            }
             break;
         case 3:
-            _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3<PoolingType::MAX>;
+            if(input->info()->data_type() == DataType::QS8)
+            {
+                _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+            }
+            else if(input->info()->data_type() == DataType::F32)
+            {
+                _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
+            }
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported pooling size");
@@ -121,18 +175,61 @@
     }
 
     // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-
     Window                 win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
     AccessWindowStatic     input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_horizontal_window);
     update_window_and_padding(win, input_access, output_access);
     output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
     INEKernel::configure(win);
 }
 
 template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling2(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    const int     fixed_point_position = _input->info()->fixed_point_position();
+    constexpr int pool_size            = 2;
+    int           pool_pad_x           = 0;
+    int           pool_pad_y           = 0;
+    int           pool_stride_x        = 0;
+    int           pool_stride_y        = 0;
+    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const auto top_data    = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
+        const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
+        qint8x8_t  res         = {};
+        if(pooling_type == PoolingType::AVG)
+        {
+            // Calculate scale
+            const qint8_t   scale     = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+            const qint8x8_t scale_vec = vdup_n_qs8(scale);
+
+            // Perform pooling
+            const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
+            res                       = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
+        }
+        else
+        {
+            const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
+            res                       = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
+        }
+        vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
+    },
+    input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -173,7 +270,80 @@
 }
 
 template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling3(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    const int     fixed_point_position = _input->info()->fixed_point_position();
+    constexpr int pool_size            = 3;
+    int           pool_pad_x           = 0;
+    int           pool_pad_y           = 0;
+    int           pool_stride_x        = 0;
+    int           pool_stride_y        = 0;
+    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+    const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const auto top_data    = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
+        const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
+        const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
+        qint8x8_t  res         = {};
+        if(pooling_type == PoolingType::AVG)
+        {
+            // Calculate scale
+            const qint8_t   scale     = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+            const qint8x8_t scale_vec = vdup_n_qs8(scale);
+
+            // Perform pooling for stride 2
+            const qint8x16_t sum_data  = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
+            const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
+            const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
+            const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
+            if(pool_stride_x == 2)
+            {
+                const qint8x8x2_t      table      = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
+                static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
+                res                               = vtbl2_s8(table, lookup_val);
+            }
+            else
+            {
+                res = vget_low_s8(final_sum);
+            }
+            res = vqmul_qs8(res, scale_vec, fixed_point_position);
+        }
+        else
+        {
+            const qint8x16_t max_data  = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
+            const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
+            const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
+            const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
+
+            if(pool_stride_x == 2)
+            {
+                const qint8x8x2_t      table      = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
+                static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
+                res                               = vtbl2_s8(table, lookup_val);
+            }
+            else
+            {
+                res = vget_low_s8(final_max);
+            }
+        }
+        vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
+    },
+    input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -227,8 +397,17 @@
     std::tie(pool_stride_x, pool_stride_y)    = _pool_info.pad_stride_info().stride();
 
     // Set step for input in x and y direction for the input
-    Window window_input(window);
-    window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, pool_stride_x));
+    Window       window_input(window);
+    unsigned int window_x_inc = 0;
+    if(_input->info()->data_type() == DataType::QS8)
+    {
+        window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+    }
+    else
+    {
+        window_x_inc = pool_stride_x;
+    }
+    window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
     window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
 
     // Run function
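For QS8 the kernel consumes a full 16-byte load per iteration and produces 8 (pool 2) or 7 (pool 3) outputs, so the input window advances by _num_elems_processed_per_iteration (doubled for stride 2) rather than by pool_stride_x as in the single-element F32 path. A scalar reference of the stride-2, 2x2 average case that the QS8 path vectorises; the saturating fixed-point multiply is approximated here by a plain truncating shift, which is my simplification rather than the library's helper:

#include <algorithm>
#include <cstdint>
#include <iostream>

// value = raw / 2^fp for QS8-style fixed point
int8_t avg_pool2x2_stride2(const int8_t *top, const int8_t *bottom, int x, int8_t scale_q, int fp)
{
    // Sum the 2x2 region with saturation to the int8 range.
    int sum = top[2 * x] + top[2 * x + 1] + bottom[2 * x] + bottom[2 * x + 1];
    sum     = std::max(-128, std::min(127, sum));

    // Multiply by the pre-computed 1/4 in Q format, then renormalise.
    const int res = (sum * scale_q) >> fp;
    return static_cast<int8_t>(std::max(-128, std::min(127, res)));
}

int main()
{
    const int8_t top[4]    = { 16, 16, 16, 16 }; // 0.5 at fixed point position 5
    const int8_t bottom[4] = { 16, 16, 16, 16 };
    const int8_t scale_q   = 0x20 >> (7 - 5);    // 1/4 rescaled to fixed point position 5
    std::cout << int(avg_pool2x2_stride2(top, bottom, 0, scale_q, 5)) << "\n"; // 16 == 0.5
    return 0;
}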
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index a6e6cad..942662e 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
@@ -40,8 +41,75 @@
 
 using namespace arm_compute;
 
+namespace
+{
+void logits_1d_max_f32(const ITensor *in, ITensor *out, const Window &window)
+{
+    Window in_slice = window.first_slice_window_1D();
+
+    Window window_max(window);
+    window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+    Window max_slice = window_max.first_slice_window_1D();
+
+    do
+    {
+        Iterator input(in, in_slice);
+        Iterator output(out, max_slice);
+
+        float32x4_t vec_max = vdupq_n_f32(-FLT_MAX);
+
+        execute_window_loop(in_slice, [&](const Coordinates & id)
+        {
+            const auto        in_ptr        = reinterpret_cast<const float *>(input.ptr());
+            const float32x4_t current_value = vld1q_f32(in_ptr);
+            vec_max                         = vmaxq_f32(vec_max, current_value);
+        },
+        input);
+
+        float32x2_t carry_max = vpmax_f32(vget_high_f32(vec_max), vget_low_f32(vec_max));
+        carry_max             = vpmax_f32(carry_max, carry_max);
+
+        *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(carry_max, 0);
+    }
+    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+
+void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window)
+{
+    Window in_slice = window.first_slice_window_1D();
+
+    Window window_max(window);
+    window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+    Window max_slice = window_max.first_slice_window_1D();
+
+    do
+    {
+        Iterator input(in, in_slice);
+        Iterator output(out, max_slice);
+
+        qint8x16_t vec_max = vdupq_n_s8(-1);
+
+        execute_window_loop(in_slice, [&](const Coordinates & id)
+        {
+            const auto       in_ptr        = reinterpret_cast<const qint8_t *>(input.ptr());
+            const qint8x16_t current_value = vld1q_qs8(in_ptr);
+            vec_max                        = vmaxq_qs8(vec_max, current_value);
+        },
+        input);
+
+        qint8x8_t carry_max = vpmax_qs8(vget_high_s8(vec_max), vget_low_s8(vec_max));
+        carry_max           = vpmax_qs8(carry_max, carry_max);
+        carry_max           = vpmax_qs8(carry_max, carry_max);
+        carry_max           = vpmax_qs8(carry_max, carry_max);
+
+        *(reinterpret_cast<int8_t *>(output.ptr())) = vget_lane_s8(carry_max, 0);
+    }
+    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+} // namespace
+
 NELogits1DMaxKernel::NELogits1DMaxKernel()
-    : _border_size()
+    : _func(nullptr), _border_size()
 {
 }
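NELogits1DMaxKernel, NELogits1DShiftExpSumKernel and NELogits1DNormKernel together implement a numerically stable softmax per row: find the row maximum, exponentiate the shifted values while accumulating their sum, then divide by that sum. A plain float reference of the three stages (unvectorised, one row):

#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> softmax_row(const std::vector<float> &in)
{
    // Stage 1: row maximum (NELogits1DMaxKernel)
    const float max_value = *std::max_element(in.begin(), in.end());

    // Stage 2: shift, exponentiate and accumulate (NELogits1DShiftExpSumKernel)
    std::vector<float> out(in.size());
    float              sum = 0.f;
    for(size_t i = 0; i < in.size(); ++i)
    {
        out[i] = std::exp(in[i] - max_value); // shifting by the max avoids overflow in exp()
        sum += out[i];
    }

    // Stage 3: normalise by the sum (NELogits1DNormKernel)
    for(float &v : out)
    {
        v /= sum;
    }
    return out;
}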
 
@@ -52,12 +120,26 @@
 
 void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
-    const int              input_width                       = input->info()->valid_region().shape.x();
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    const int    input_width                       = input->info()->valid_region().shape.x();
+    unsigned int num_elems_processed_per_iteration = 0;
+
+    switch(input->info()->data_type())
+    {
+        case DataType::QS8:
+            _func                             = &logits_1d_max_qs8;
+            num_elems_processed_per_iteration = 16;
+            break;
+        case DataType::F32:
+            num_elems_processed_per_iteration = 4;
+            _func                             = &logits_1d_max_f32;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type.");
+    }
 
     _input       = input;
     _output      = output;
@@ -81,60 +163,170 @@
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    Window in_slice = window.first_slice_window_1D();
+    (*_func)(_input, _output, window);
+}
 
+namespace
+{
+void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+{
     Window window_max(window);
     window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+
     Window max_slice = window_max.first_slice_window_1D();
+    Window in_slice  = window.first_slice_window_1D();
+
+    constexpr int step        = 4;
+    const int     long_steps  = in->info()->valid_region().shape.x() / step;
+    const int     small_steps = in->info()->valid_region().shape.x() % step;
 
     do
     {
-        Iterator in(_input, in_slice);
-        Iterator out(_output, max_slice);
+        Iterator input(in, in_slice);
+        Iterator exp(out, in_slice);
+        Iterator _max(max, max_slice);
+        Iterator _sum(sum, max_slice);
 
-        float32x4_t vec_max = vdupq_n_f32(-FLT_MAX);
+        // Get pointers
+        auto in_ptr  = reinterpret_cast<const float *>(input.ptr());
+        auto exp_ptr = reinterpret_cast<float *>(exp.ptr());
 
-        execute_window_loop(in_slice, [&](const Coordinates & id)
+        // Init sum to zero
+        float32x4_t vec_sum_value = vdupq_n_f32(0.0f);
+
+        // Get max value
+        const auto        max_ptr = reinterpret_cast<const float *>(_max.ptr());
+        const float32x4_t vec_max = vdupq_n_f32(*max_ptr);
+
+        // Run neon loop
+        for(int i = 0; i < long_steps; ++i)
         {
-            const auto        in_ptr        = reinterpret_cast<const float *>(in.ptr());
-            const float32x4_t current_value = vld1q_f32(in_ptr);
-            vec_max                         = vmaxq_f32(vec_max, current_value);
-        },
-        in);
+            float32x4_t vec_elements = vld1q_f32(in_ptr);
+            vec_elements             = vsubq_f32(vec_elements, vec_max);
+            vec_elements             = vexpq_f32(vec_elements);
 
-        float32x2_t carry_max = vpmax_f32(vget_high_f32(vec_max), vget_low_f32(vec_max));
-        carry_max             = vpmax_f32(carry_max, carry_max);
+            vst1q_f32(exp_ptr, vec_elements);
+            vec_sum_value = vaddq_f32(vec_elements, vec_sum_value);
 
-        *(reinterpret_cast<float *>(out.ptr())) = vget_lane_f32(carry_max, 0);
+            in_ptr += step;
+            exp_ptr += step;
+        }
+
+        // Reduce sum
+        float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
+        carry_addition             = vpadd_f32(carry_addition, carry_addition);
+        float sum                  = vget_lane_f32(carry_addition, 0);
+
+        // Run remaining elements
+        for(int i = 0; i < small_steps; ++i)
+        {
+            float element = std::exp(in_ptr[i] - *max_ptr);
+            exp_ptr[i]    = element;
+            sum += element;
+        }
+
+        *(reinterpret_cast<float *>(_sum.ptr())) = sum;
     }
     while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
 }
+void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+{
+    Window window_max(window);
+    window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+    Window max_slice = window_max.first_slice_window_1D();
+    Window in_slice  = window.first_slice_window_1D();
+
+    constexpr int step                 = 8;
+    const int     long_steps           = in->info()->valid_region().shape.x() / step;
+    const int     small_steps          = in->info()->valid_region().shape.x() % step;
+    const int     fixed_point_position = in->info()->fixed_point_position();
+
+    do
+    {
+        Iterator input(in, in_slice);
+        Iterator exp(out, in_slice);
+        Iterator _max(max, max_slice);
+        Iterator _sum(sum, max_slice);
+
+        // Get pointers
+        auto in_ptr  = reinterpret_cast<const qint8_t *>(input.ptr());
+        auto exp_ptr = reinterpret_cast<qint8_t *>(exp.ptr());
+
+        // Init sum to zero
+        qint16x8_t vec_sum_value = vdupq_n_qs16(0);
+
+        // Get max value
+        const auto      max_ptr = reinterpret_cast<const qint8_t *>(_max.ptr());
+        const qint8x8_t vec_max = vdup_n_qs8(*max_ptr);
+
+        // Run neon loop
+        for(int i = 0; i < long_steps; ++i)
+        {
+            qint8x8_t vec_elements = vld1_qs8(in_ptr);
+            vec_elements           = vqsub_qs8(vec_elements, vec_max);
+            vec_elements           = vqexp_qs8(vec_elements, fixed_point_position);
+
+            vst1_qs8(exp_ptr, vec_elements);
+            vec_sum_value = vqaddq_qs16(vec_sum_value, vmovl_s8(vec_elements));
+
+            in_ptr += step;
+            exp_ptr += step;
+        }
+        // Reduce sum
+        const qint16x4_t sum_red = vqadd_qs16(vget_low_s16(vec_sum_value), vget_high_s16(vec_sum_value));
+        const qint16_t   sum0    = sqadd_qs16(vget_lane_s16(sum_red, 0), vget_lane_s16(sum_red, 1));
+        const qint16_t   sum1    = sqadd_qs16(vget_lane_s16(sum_red, 2), vget_lane_s16(sum_red, 3));
+        qint16_t         sum     = sqadd_qs16(sum0, sum1);
+
+        // Run remaining elements
+        for(int i = 0; i < small_steps; ++i)
+        {
+            qint8_t element = sqexp_qs8(sqsub_qs8(in_ptr[i], *max_ptr), fixed_point_position);
+            exp_ptr[i]      = element;
+            sum             = sqadd_qs16(sum, element);
+        }
+
+        *(reinterpret_cast<qint8_t *>(_sum.ptr())) = sqmovn_qs16(sum);
+    }
+    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+} // namespace
 
 NELogits1DShiftExpSumKernel::NELogits1DShiftExpSumKernel()
-    : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr), _border_size(0)
+    : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
 {
 }
 
-BorderSize NELogits1DShiftExpSumKernel::border_size() const
-{
-    return _border_size;
-}
 void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, max, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, max, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
 
-    const int              input_width                       = input->info()->valid_region().shape.x();
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    unsigned int num_elems_processed_per_iteration = input->info()->valid_region().shape.x();
 
-    _input       = input;
-    _max         = max;
-    _output      = output;
-    _sum         = sum;
-    _border_size = BorderSize(0, input_width % num_elems_processed_per_iteration, 0, 0);
+    switch(input->info()->data_type())
+    {
+        case DataType::QS8:
+            _func = &logits_1d_shift_exp_sum_qs8;
+            break;
+        case DataType::F32:
+            _func = &logits_1d_shift_exp_sum_f32;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type.");
+    }
+
+    _input  = input;
+    _max    = max;
+    _output = output;
+    _sum    = sum;
 
     // Configure kernel window
     Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
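Both shift-exp-sum variants now split each row into a vector body of whole steps (4 floats or 8 QS8 values) plus a scalar tail, instead of relying on the right-hand border padding the removed _border_size used to request. A minimal sketch of that split, assuming a row of length width processed in chunks of step elements:

// Process a row of length 'width' in vector-sized chunks plus a scalar tail.
template <typename VectorBody, typename ScalarTail>
void split_row(int width, int step, VectorBody &&body, ScalarTail &&tail)
{
    const int long_steps  = width / step; // full vector iterations
    const int small_steps = width % step; // leftover elements

    int x = 0;
    for(int i = 0; i < long_steps; ++i, x += step)
    {
        body(x); // e.g. vld1q_f32 / vexpq_f32 / vst1q_f32 on [x, x + step)
    }
    for(int i = 0; i < small_steps; ++i)
    {
        tail(x + i); // e.g. std::exp on the remaining scalars
    }
}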
@@ -155,57 +347,87 @@
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    Window window_max(window);
-    window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+    (*_func)(_input, _max, _output, _sum, window);
+}
 
-    Window max_slice = window_max.first_slice_window_1D();
+namespace
+{
+void logits_1d_norm_f32(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
+{
+    Window window_sum(window);
+    window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+    Window sum_slice = window_sum.first_slice_window_1D();
     Window in_slice  = window.first_slice_window_1D();
 
     do
     {
-        Iterator in(_input, in_slice);
-        Iterator exp(_output, in_slice);
-        Iterator max(_max, max_slice);
-        Iterator sum(_sum, max_slice);
+        Iterator input(in, in_slice);
+        Iterator _sum(sum, sum_slice);
+        Iterator output(out, in_slice);
 
-        float32x4_t vec_sum_value = vdupq_n_f32(0.0f);
-
-        const auto  max_ptr = reinterpret_cast<const float *>(max.ptr());
-        float32x4_t vec_max = vdupq_n_f32(*max_ptr);
+        const float       sum_value        = *reinterpret_cast<const float *>(_sum.ptr());
+        const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value);
 
         execute_window_loop(in_slice, [&](const Coordinates & id)
         {
-            const auto in_ptr  = reinterpret_cast<const float *>(in.ptr());
-            const auto exp_ptr = reinterpret_cast<float *>(exp.ptr());
+            const auto in_ptr  = reinterpret_cast<const float *>(input.ptr());
+            const auto out_ptr = reinterpret_cast<float *>(output.ptr());
 
-            float32x4_t vec_elements = vld1q_f32(in_ptr);
-            vec_elements             = vsubq_f32(vec_elements, vec_max);
-            vec_elements             = vexpq_f32(vec_elements);
+            const float32x4_t vec_in           = vld1q_f32(in_ptr);
+            const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed);
 
-            vst1q_f32(exp_ptr, vec_elements);
-
-            vec_sum_value = vaddq_f32(vec_elements, vec_sum_value);
+            vst1q_f32(out_ptr, normalized_value);
         },
-        in, exp);
-
-        float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
-        carry_addition             = vpadd_f32(carry_addition, carry_addition);
-
-        *(reinterpret_cast<float *>(sum.ptr())) = vget_lane_f32(carry_addition, 0);
+        input, output);
     }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
 }
+void logits_1d_norm_qs8(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
+{
+    Window window_sum(window);
+    window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+    Window sum_slice = window_sum.first_slice_window_1D();
+    Window in_slice  = window.first_slice_window_1D();
+
+    const int fixed_point_position = in->info()->fixed_point_position();
+
+    do
+    {
+        Iterator input(in, in_slice);
+        Iterator _sum(sum, sum_slice);
+        Iterator output(out, in_slice);
+
+        const int8_t     sum_value        = *reinterpret_cast<const qint8_t *>(_sum.ptr());
+        const qint8x16_t vec_sum_inversed = vqrecipq_qs8(vdupq_n_qs8(sum_value), fixed_point_position);
+
+        execute_window_loop(in_slice, [&](const Coordinates & id)
+        {
+            const auto in_ptr  = reinterpret_cast<const qint8_t *>(input.ptr());
+            const auto out_ptr = reinterpret_cast<qint8_t *>(output.ptr());
+
+            const qint8x16_t vec_in           = vld1q_qs8(in_ptr);
+            const qint8x16_t normalized_value = vqmulq_qs8(vec_in, vec_sum_inversed, fixed_point_position);
+
+            vst1q_qs8(out_ptr, normalized_value);
+        },
+        input, output);
+    }
+    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
+} // namespace
 
 NELogits1DNormKernel::NELogits1DNormKernel()
-    : _input(nullptr), _sum(nullptr), _output(nullptr)
+    : _func(nullptr), _input(nullptr), _sum(nullptr), _output(nullptr)
 {
 }
 
 void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, sum);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
 
     _input  = input;
@@ -213,7 +435,21 @@
     _output = output;
 
     // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    unsigned int num_elems_processed_per_iteration = 0;
+
+    switch(input->info()->data_type())
+    {
+        case DataType::QS8:
+            _func                             = &logits_1d_norm_qs8;
+            num_elems_processed_per_iteration = 16;
+            break;
+        case DataType::F32:
+            num_elems_processed_per_iteration = 4;
+            _func                             = &logits_1d_norm_f32;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type.");
+    }
 
     Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
 
@@ -232,32 +468,7 @@
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    Window window_sum(window);
-    window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
-    Window sum_slice = window_sum.first_slice_window_1D();
-    Window in_slice  = window.first_slice_window_1D();
-
-    do
-    {
-        Iterator in(_input, in_slice);
-        Iterator sum(_sum, sum_slice);
-        Iterator out(_output, in_slice);
-
-        float             sum_value        = *reinterpret_cast<const float *>(sum.ptr());
-        const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value);
-
-        execute_window_loop(in_slice, [&](const Coordinates & id)
-        {
-            const auto in_ptr  = reinterpret_cast<const float *>(in.ptr());
-            const auto out_ptr = reinterpret_cast<float *>(out.ptr());
-
-            const float32x4_t vec_in           = vld1q_f32(in_ptr);
-            const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed);
-
-            vst1q_f32(out_ptr, normalized_value);
-        },
-        in, out);
-    }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+    (*_func)(_input, _sum, _output, window);
 }
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index d0ef58f..492de8a 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -179,11 +179,20 @@
 
 void NETransposeKernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    TensorShape  output_shape{ input->info()->tensor_shape() };
+    const size_t w_out = input->info()->dimension(1);
+    const size_t h_out = input->info()->dimension(0);
+    output_shape.set(0, w_out);
+    output_shape.set(1, h_out);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
 
     _input  = input;
     _output = output;
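With auto_init_if_empty() the transpose kernel can now deduce the output metadata (swapped width/height, same data type and fixed point position) when the caller leaves the output tensor info empty. A minimal usage sketch under that assumption, using the runtime Tensor wrapper from the same release; allocation and run() are omitted:

#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void configure_transpose()
{
    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::QS8, 3 /* fixed point position */));

    // output's info is left empty: configure() fills in 4x8, QS8, fixed point position 3.
    NETransposeKernel kernel;
    kernel.configure(&input, &output);
}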
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
new file mode 100644
index 0000000..aa6be44
--- /dev/null
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+namespace
+{
+template <typename T>
+void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
+{
+    const unsigned int kernel_size     = input->info()->dimension(0);
+    const unsigned int kernel_depth    = input->info()->dimension(2);
+    const unsigned int input_stride_x  = input->info()->strides_in_bytes().x();
+    const unsigned int input_stride_y  = input->info()->strides_in_bytes().y();
+    const unsigned int input_stride_z  = input->info()->strides_in_bytes().z();
+    const unsigned int output_stride_y = output->info()->strides_in_bytes().y();
+
+    // Create iterators
+    Iterator in(input, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Get column index
+        const int kernel_idx = id[3];
+        const int kernel_idz = id[4];
+
+        // Setup pointers
+        const uint8_t *tmp_input_ptr        = in.ptr();
+        uint8_t       *tmp_output_ptr       = output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+        const uint8_t *curr_input_row_ptr   = tmp_input_ptr;
+        const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
+
+        // Linearize volume
+        for(unsigned int d = 0; d < kernel_depth; ++d)
+        {
+            for(unsigned int j = 0; j < kernel_size; ++j)
+            {
+                for(unsigned int i = 0; i < kernel_size; ++i)
+                {
+                    *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
+                    tmp_input_ptr += input_stride_x;
+                    tmp_output_ptr += output_stride_y;
+                }
+                curr_input_row_ptr += input_stride_y;
+                tmp_input_ptr = curr_input_row_ptr;
+            }
+            curr_input_depth_ptr += input_stride_z;
+            curr_input_row_ptr = curr_input_depth_ptr;
+            tmp_input_ptr      = curr_input_depth_ptr;
+        }
+
+        // Add bias
+        if(bias != nullptr)
+        {
+            *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz))));
+        }
+    },
+    in);
+}
+} // namespace
+
+NEWeightsReshapeKernel::NEWeightsReshapeKernel()
+    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != input->info()->dimension(1));
+
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
+
+    TensorShape output_shape{ input->info()->tensor_shape() };
+    output_shape.collapse(3);
+    const size_t tmp_dim = output_shape[0];
+    output_shape.set(0, output_shape[1]);
+    output_shape.set(1, tmp_dim + (bias != nullptr ? 1 : 0));
+
+    // Set data type and fixed point position for output tensor if not yet configured
+    set_data_type_if_unknown(*output->info(), dt);
+    set_fixed_point_position_if_zero(*output->info(), fixed_point_position);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+    if(bias != nullptr)
+    {
+        TensorShape bias_shape{ input->info()->tensor_shape()[3] };
+
+        // Set data type and shape for bias tensor if not yet configured
+        set_data_type_if_unknown(*bias->info(), dt);
+        set_fixed_point_position_if_zero(*bias->info(), fixed_point_position);
+        set_shape_if_empty(*bias->info(), bias_shape);
+
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(bias->info()->tensor_shape(), bias_shape);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F32, DataType::QS8);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
+    }
+
+    _input  = input;
+    _bias   = bias;
+    _output = output;
+
+    switch(_input->info()->data_type())
+    {
+        case DataType::F32:
+        {
+            _func = &weights_reshape<uint32_t>;
+            break;
+        }
+        case DataType::QS8:
+        {
+            _func = &weights_reshape<uint8_t>;
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR_ON("Data type not supported");
+            break;
+        }
+    }
+
+    // Configure kernel
+    Window window = calculate_max_window(*input->info(), Steps());
+    window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+    window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+    window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+
+    // NEWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    INEKernel::configure(window);
+}
+
+void NEWeightsReshapeKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    (*_func)(_input, _bias, _output, window);
+}
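configure() reshapes a set of K x K x D kernels into one column per kernel so that convolution can be run as a matrix multiplication; when a bias is present, one extra row is appended at the bottom of each column. A small worked sketch of the shape computation (the concrete sizes are only an example):

#include <cassert>
#include <cstddef>

int main()
{
    // Example weights: 3x3 kernels, 16 input channels, 32 kernels, plus a bias term.
    const size_t kernel_size  = 3;
    const size_t kernel_depth = 16;
    const size_t num_kernels  = 32;
    const bool   has_bias     = true;

    // collapse(3): fold kernel x, y and depth into one linearised dimension.
    const size_t linear_size = kernel_size * kernel_size * kernel_depth; // 144

    // Swap the two remaining dimensions and append the bias row.
    const size_t out_width  = num_kernels;                      // dimension 0
    const size_t out_height = linear_size + (has_bias ? 1 : 0); // dimension 1

    assert(out_width == 32 && out_height == 145);
    return 0;
}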
diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
new file mode 100644
index 0000000..f5a282d
--- /dev/null
+++ b/src/core/SubTensorInfo.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/SubTensorInfo.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+SubTensorInfo::SubTensorInfo()
+    : _parent(nullptr), _tensor_shape(), _coords(), _valid_region{ Coordinates(), _tensor_shape }
+{
+}
+
+SubTensorInfo::SubTensorInfo(ITensorInfo *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+    : _parent(parent), _tensor_shape(tensor_shape), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }
+{
+    ARM_COMPUTE_ERROR_ON(parent == nullptr);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape);
+
+    // Initialize valid region
+    Coordinates coordinates;
+    coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
+    _valid_region = ValidRegion{ coordinates, _tensor_shape };
+}
+
+void SubTensorInfo::set_tensor_shape(TensorShape shape)
+{
+    ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape);
+    _tensor_shape = shape;
+}
+
+bool SubTensorInfo::extend_padding(const PaddingSize &padding)
+{
+    ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+    ARM_COMPUTE_ERROR_ON(!_parent->is_resizable());
+
+    // Extend parent padding if required
+    return _parent->extend_padding(padding);
+}
+
+size_t SubTensorInfo::offset_element_in_bytes(const Coordinates &pos) const
+{
+    ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(pos, _tensor_shape.num_dimensions());
+
+    size_t         offset  = offset_first_element_in_bytes();
+    const Strides &strides = strides_in_bytes();
+
+    for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
+    {
+        offset += pos[i] * strides[i];
+    }
+
+    return offset;
+}
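offset_element_in_bytes() resolves a coordinate inside the sub-tensor against the parent's strides and first-element offset, so a SubTensorInfo never owns memory of its own; it simply re-indexes the parent buffer. A scalar sketch of that address computation, assuming offset_first_element_in_bytes() already accounts for the sub-tensor's anchor coordinates:

#include <cstddef>
#include <vector>

// byte offset = first_element_offset + sum_i(pos[i] * strides_in_bytes[i])
size_t element_offset(size_t first_element_offset,
                      const std::vector<size_t> &pos,
                      const std::vector<size_t> &strides_in_bytes)
{
    size_t offset = first_element_offset;
    for(size_t i = 0; i < pos.size(); ++i)
    {
        offset += pos[i] * strides_in_bytes[i];
    }
    return offset;
}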
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index e81c0de..3d07ccb 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -31,8 +31,29 @@
 using namespace arm_compute;
 
 TensorInfo::TensorInfo()
-    : _total_size(0), _fixed_point_pos(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN), _is_resizable{ true },
-      _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }
+    : _total_size(0), _fixed_point_position(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN),
+      _is_resizable{ true }, _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }
+{
+}
+
+TensorInfo::TensorInfo(const ITensorInfo &info)
+    : TensorInfo()
+{
+    _total_size                    = info.total_size();
+    _fixed_point_position          = info.fixed_point_position();
+    _offset_first_element_in_bytes = info.offset_first_element_in_bytes();
+    _strides_in_bytes              = info.strides_in_bytes();
+    _num_channels                  = info.num_channels();
+    _tensor_shape                  = info.tensor_shape();
+    _data_type                     = info.data_type();
+    _format                        = info.format();
+    _is_resizable                  = info.is_resizable();
+    _valid_region                  = info.valid_region();
+    _padding                       = info.padding();
+}
+
+TensorInfo::TensorInfo(Format format)
+    : TensorInfo(TensorShape(), format)
 {
 }
 
@@ -47,10 +68,16 @@
     init(tensor_shape, format);
 }
 
-TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, size_t fixed_point_pos)
+TensorInfo::TensorInfo(size_t num_channels, DataType data_type, size_t fixed_point_position)
     : TensorInfo()
 {
-    init(tensor_shape, num_channels, data_type, fixed_point_pos);
+    init(TensorShape(), num_channels, data_type, fixed_point_position);
+}
+
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position)
+    : TensorInfo()
+{
+    init(tensor_shape, num_channels, data_type, fixed_point_position);
 }
 
 TensorInfo::TensorInfo(const HOGInfo &hog_info, unsigned int width, unsigned int height)
@@ -59,6 +86,11 @@
     init(hog_info, width, height);
 }
 
+void TensorInfo::init(Format format)
+{
+    init(TensorShape(), format);
+}
+
 void TensorInfo::init(const TensorShape &tensor_shape, Format format)
 {
     size_t         num_channels = num_channels_from_format(format);
@@ -81,33 +113,34 @@
     _format = format;
 }
 
-void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, size_t fixed_point_pos)
+void TensorInfo::init(size_t num_channels, DataType data_type, size_t fixed_point_position)
 {
-    ARM_COMPUTE_ERROR_ON(0 == num_channels);
+    init(TensorShape(), num_channels, data_type, fixed_point_position);
+}
 
-    _fixed_point_pos               = fixed_point_pos;
-    _data_type                     = data_type;
-    _num_channels                  = num_channels;
-    _format                        = Format::UNKNOWN;
-    _tensor_shape                  = tensor_shape;
-    _offset_first_element_in_bytes = 0;
-    _strides_in_bytes              = compute_strides(*this);
+void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position)
+{
+    ARM_COMPUTE_ERROR_ON(num_channels == 0);
+    ARM_COMPUTE_ERROR_ON(data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+    ARM_COMPUTE_ERROR_ON(data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
 
-    const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
-    _total_size                           = _tensor_shape[idx_last_dimension] * _strides_in_bytes[idx_last_dimension];
+    _fixed_point_position = fixed_point_position;
+    _data_type            = data_type;
+    _num_channels         = num_channels;
+    _format               = Format::UNKNOWN;
 
-    Coordinates coordinates;
-    coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
-    _valid_region = ValidRegion{ coordinates, _tensor_shape };
+    set_tensor_shape(tensor_shape);
 }
 
 void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type,
                       const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
-                      size_t total_size_in_bytes, size_t fixed_point_pos)
+                      size_t total_size_in_bytes, int fixed_point_position)
 {
-    ARM_COMPUTE_ERROR_ON(0 == num_channels);
+    ARM_COMPUTE_ERROR_ON(num_channels == 0);
+    ARM_COMPUTE_ERROR_ON(data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+    ARM_COMPUTE_ERROR_ON(data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
 
-    _fixed_point_pos               = fixed_point_pos;
+    _fixed_point_position          = fixed_point_position;
     _data_type                     = data_type;
     _num_channels                  = num_channels;
     _format                        = Format::UNKNOWN;
@@ -146,15 +179,17 @@
     return total_size;
 }
 
-size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, size_t fixed_point_pos)
+size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position)
 {
-    ARM_COMPUTE_ERROR_ON(0 == num_channels);
+    ARM_COMPUTE_ERROR_ON(num_channels == 0);
+    ARM_COMPUTE_ERROR_ON(data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+    ARM_COMPUTE_ERROR_ON(data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
 
-    _fixed_point_pos = fixed_point_pos;
-    _data_type       = data_type;
-    _num_channels    = num_channels;
-    _format          = Format::UNKNOWN;
-    _tensor_shape    = tensor_shape;
+    _fixed_point_position = fixed_point_position;
+    _data_type            = data_type;
+    _num_channels         = num_channels;
+    _format               = Format::UNKNOWN;
+    _tensor_shape         = tensor_shape;
 
     Coordinates coordinates;
     coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
@@ -183,10 +218,11 @@
 {
     ARM_COMPUTE_ERROR_ON(!_is_resizable);
 
-    /* Some kernels compute 32 elements at the time, worst case scenario they will read 32 values after the last element */
-    const size_t extra_pad_x = 32;
-    const size_t pad_x       = 4;
-    const size_t pad_y       = (_tensor_shape.num_dimensions() == 1 ? 0 : 4); // Skip pad_y if the tensor has just 1 dimension
+    // Some kernels compute 32 elements at a time, worst case scenario they
+    // will read 32 values after the last element
+    const size_t extra_pad_x = _tensor_shape.num_dimensions() < 1 ? 0 : 32;
+    const size_t pad_x       = _tensor_shape.num_dimensions() < 1 ? 0 : 4;
+    const size_t pad_y       = _tensor_shape.num_dimensions() < 2 ? 0 : 4;
 
     return extend_padding(PaddingSize(pad_y, pad_x + extra_pad_x, pad_y, pad_x));
 }
@@ -196,7 +232,7 @@
     // Calculate resulting stride for the X, Y and Z dimension
     const size_t stride_x = element_size();
     const size_t stride_y = (padding.left + _tensor_shape[0] + padding.right) * stride_x;
-    const size_t stride_z = _tensor_shape.num_dimensions() == 1 ? 0 : (padding.top + _tensor_shape[1] + padding.bottom) * stride_y;
+    const size_t stride_z = (padding.top + _tensor_shape[1] + padding.bottom) * stride_y;
 
     Strides      required_strides;
     size_t       required_total_size           = 0;
@@ -204,9 +240,18 @@
 
     switch(_tensor_shape.num_dimensions())
     {
+        case 0:
+        {
+            if(_tensor_shape.total_size() > 0)
+            {
+                required_strides    = Strides(stride_x);
+                required_total_size = stride_z;
+            }
+            break;
+        }
         case 1:
             required_strides    = compute_strides(*this, stride_x);
-            required_total_size = stride_y;
+            required_total_size = stride_z;
             break;
         case 2:
             required_strides    = compute_strides(*this, stride_x, stride_y);
@@ -261,12 +306,60 @@
     return updated;
 }
 
+void TensorInfo::set_data_type(DataType data_type)
+{
+    _data_type = data_type;
+    _format    = Format::UNKNOWN;
+}
+
+void TensorInfo::set_num_channels(int num_channels)
+{
+    _num_channels = num_channels;
+    _format       = Format::UNKNOWN;
+}
+
 void TensorInfo::set_format(Format format)
 {
-    ARM_COMPUTE_ERROR_ON(num_channels_from_format(format) != _num_channels);
-    ARM_COMPUTE_ERROR_ON(data_type_from_format(format) != _data_type);
-
     _format = format;
+
+    if(_data_type == DataType::UNKNOWN)
+    {
+        _num_channels = num_channels_from_format(format);
+        _data_type    = data_type_from_format(format);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON(num_channels_from_format(format) != _num_channels);
+        ARM_COMPUTE_ERROR_ON(data_type_from_format(format) != _data_type);
+    }
+}
+
+void TensorInfo::set_tensor_shape(TensorShape shape)
+{
+    _tensor_shape                  = shape;
+    _offset_first_element_in_bytes = 0;
+    _strides_in_bytes              = compute_strides(*this);
+
+    if(_tensor_shape.num_dimensions() == 0)
+    {
+        _total_size = _strides_in_bytes[0];
+    }
+    else
+    {
+        const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
+        _total_size                           = _tensor_shape[idx_last_dimension] * _strides_in_bytes[idx_last_dimension];
+    }
+
+    Coordinates coordinates;
+    coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
+    _valid_region = ValidRegion{ coordinates, _tensor_shape };
+}
+
+void TensorInfo::set_fixed_point_position(int fixed_point_position)
+{
+    ARM_COMPUTE_ERROR_ON(_data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+    ARM_COMPUTE_ERROR_ON(_data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
+    _fixed_point_position = fixed_point_position;
 }
 
 size_t TensorInfo::offset_element_in_bytes(const Coordinates &pos) const
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index a6ece09..bf005c1 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -23,6 +23,8 @@
  */
 #include "arm_compute/core/Utils.h"
 
+#include "arm_compute/core/FixedPoint.h"
+
 #include <algorithm>
 #include <cmath>
 #include <cstdint>
@@ -128,8 +130,10 @@
         { DataType::UNKNOWN, "UNKNOWN" },
         { DataType::S8, "S8" },
         { DataType::U8, "U8" },
+        { DataType::QS8, "QS8" },
         { DataType::S16, "S16" },
         { DataType::U16, "U16" },
+        { DataType::QS16, "QS16" },
         { DataType::S32, "S32" },
         { DataType::U32, "U32" },
         { DataType::S64, "S64" },
@@ -210,10 +214,23 @@
     return border_mode_map[border_mode];
 }
 
-std::string arm_compute::lower_string(std::string val)
+const std::string &arm_compute::string_from_norm_type(NormType type)
 {
-    std::transform(val.begin(), val.end(), val.begin(), ::tolower);
-    return val;
+    static std::map<NormType, const std::string> norm_type_map =
+    {
+        { NormType::IN_MAP_1D, "IN_MAP_1D" },
+        { NormType::IN_MAP_2D, "IN_MAP_2D" },
+        { NormType::CROSS_MAP, "CROSS_MAP" },
+    };
+
+    return norm_type_map[type];
+}
+
+std::string arm_compute::lower_string(const std::string &val)
+{
+    std::string res = val;
+    std::transform(res.begin(), res.end(), res.begin(), ::tolower);
+    return res;
 }
 
 const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height, unsigned int kernel_size,
@@ -259,6 +276,10 @@
         case DataType::U8:
             print_consecutive_elements_impl<uint8_t>(s, ptr, n, stream_width, element_delim);
             break;
+        case DataType::QS8:
+        case DataType::S8:
+            print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, element_delim);
+            break;
         case DataType::U16:
             print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width, element_delim);
             break;
@@ -287,6 +308,9 @@
     {
         case DataType::U8:
             return max_consecutive_elements_display_width_impl<uint8_t>(s, ptr, n);
+        case DataType::QS8:
+        case DataType::S8:
+            return max_consecutive_elements_display_width_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n);
         case DataType::U16:
             return max_consecutive_elements_display_width_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n);
         case DataType::S16:
@@ -302,4 +326,4 @@
         default:
             ARM_COMPUTE_ERROR("Undefined element size for given data type");
     }
-}
\ No newline at end of file
+}
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
index 1ab8dcc..ae2841d 100644
--- a/src/core/Validate.cpp
+++ b/src/core/Validate.cpp
@@ -84,7 +84,7 @@
 
     for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
     {
-        ARM_COMPUTE_ERROR_ON_LOC_MSG(win[i].start() != 0 || win[i].end() != 1,
+        ARM_COMPUTE_ERROR_ON_LOC_MSG(win[i].start() != 0 || win[i].end() != win[i].step(),
                                      function, file, line,
                                      "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
     }
@@ -172,3 +172,44 @@
                                  function, file, line,
                                  "This kernel hasn't been configured.");
 }
+
+void arm_compute::error_on_invalid_subtensor(const char *function, const char *file, const int line,
+                                             const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape)
+{
+    ARM_COMPUTE_UNUSED(function);
+    ARM_COMPUTE_UNUSED(file);
+    ARM_COMPUTE_UNUSED(line);
+    ARM_COMPUTE_UNUSED(parent_shape);
+    ARM_COMPUTE_UNUSED(coords);
+    ARM_COMPUTE_UNUSED(shape);
+
+    // Subtensor should not index in x, y dimensions.
+    ARM_COMPUTE_ERROR_ON_LOC(((coords.x() != 0) || (coords.y() != 0)), function, file, line);
+    // Subtensor shape should match parent tensor in x, y dimensions.
+    ARM_COMPUTE_ERROR_ON_LOC(((parent_shape.x() != shape.x()) || (parent_shape.y() != shape.y())), function, file, line);
+
+    // Check dimensions
+    for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_ERROR_ON_LOC(((coords[i] >= static_cast<int>(parent_shape[i])) || (coords[i] + static_cast<int>(shape[i]) > static_cast<int>(parent_shape[i]))),
+                                 function, file, line);
+    }
+}
+
+void arm_compute::error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
+                                                          const ValidRegion &parent_valid_region, const ValidRegion &valid_region)
+{
+    ARM_COMPUTE_UNUSED(function);
+    ARM_COMPUTE_UNUSED(file);
+    ARM_COMPUTE_UNUSED(line);
+    ARM_COMPUTE_UNUSED(parent_valid_region);
+    ARM_COMPUTE_UNUSED(valid_region);
+
+    // Check valid regions
+    for(unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d)
+    {
+        ARM_COMPUTE_ERROR_ON_LOC((parent_valid_region.anchor[d] > valid_region.anchor[d]), function, file, line);
+        ARM_COMPUTE_ERROR_ON_LOC((parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) < (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])),
+                                 function, file, line);
+    }
+}
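
For illustration only (commentary, not part of the patch): a minimal sketch of what the new subtensor check enforces, assuming the helper is declared in arm_compute/core/Validate.h and using made-up tensor sizes. The subtensor must keep the parent's x/y extent, start at a zero x/y offset, and stay inside the parent along every dimension.

    #include "arm_compute/core/Coordinates.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Validate.h"

    using namespace arm_compute;

    void subtensor_check_sketch()
    {
        const TensorShape parent_shape(16U, 16U, 8U); // hypothetical 16x16x8 parent
        const TensorShape sub_shape(16U, 16U, 4U);    // same x/y extent, 4 planes deep
        const Coordinates coords(0, 0, 2);            // start at plane 2, no x/y offset

        // Passes: planes [2, 6) lie inside [0, 8) and the x/y extents match the parent.
        error_on_invalid_subtensor(__func__, __FILE__, __LINE__, parent_shape, coords, sub_shape);
    }
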
diff --git a/src/runtime/CL/CLHOG.cpp b/src/runtime/CL/CLHOG.cpp
new file mode 100644
index 0000000..3f5266c
--- /dev/null
+++ b/src/runtime/CL/CLHOG.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/CLHOG.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOG::CLHOG()
+    : _info(), _buffer()
+{
+}
+
+void CLHOG::init(const HOGInfo &input)
+{
+    ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+    _info   = input;
+    _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info()->descriptor_size() * sizeof(float));
+}
+
+void CLHOG::free()
+{
+    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+
+    _buffer = cl::Buffer();
+}
+
+const HOGInfo *CLHOG::info() const
+{
+    return &_info;
+}
+
+const cl::Buffer &CLHOG::cl_buffer() const
+{
+    return _buffer;
+}
+
+void CLHOG::map(bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(descriptor() != nullptr);
+    ICLHOG::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLHOG::unmap()
+{
+    ARM_COMPUTE_ERROR_ON(descriptor() == nullptr);
+    ICLHOG::unmap(CLScheduler::get().queue());
+}
+
+uint8_t *CLHOG::do_map(cl::CommandQueue &q, bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+    return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size()));
+}
+
+void CLHOG::do_unmap(cl::CommandQueue &q)
+{
+    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+    q.enqueueUnmapMemObject(_buffer, descriptor());
+}
\ No newline at end of file
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
new file mode 100644
index 0000000..b9e8739
--- /dev/null
+++ b/src/runtime/CL/CLMultiHOG.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMultiHOG.h"
+
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+CLMultiHOG::CLMultiHOG(size_t num_models)
+    : _num_models(num_models), _model(arm_compute::cpp14::make_unique<CLHOG[]>(_num_models))
+{
+}
+
+size_t CLMultiHOG::num_models() const
+{
+    return _num_models;
+}
+
+ICLHOG *CLMultiHOG::cl_model(size_t index)
+{
+    ARM_COMPUTE_ERROR_ON(index >= _num_models);
+    return (_model.get() + index);
+}
+
+const ICLHOG *CLMultiHOG::cl_model(size_t index) const
+{
+    ARM_COMPUTE_ERROR_ON(index >= _num_models);
+    return (_model.get() + index);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 1f3dbbe..fe25ce5 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -28,7 +28,7 @@
 using namespace arm_compute;
 
 CLScheduler::CLScheduler()
-    : _context(), _queue()
+    : _context(), _queue(), _target(GPUTarget::MIDGARD)
 {
 }
 
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
new file mode 100644
index 0000000..b228c0a
--- /dev/null
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+    : _parent(nullptr), _info()
+{
+    ARM_COMPUTE_ERROR_ON(parent == nullptr);
+    _info   = SubTensorInfo(parent->info(), tensor_shape, coords);
+    _parent = parent;
+}
+
+ITensorInfo *CLSubTensor::info() const
+{
+    return &_info;
+}
+
+ITensorInfo *CLSubTensor::info()
+{
+    return &_info;
+}
+
+const cl::Buffer &CLSubTensor::cl_buffer() const
+{
+    ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+    return _parent->cl_buffer();
+}
+
+ICLTensor *CLSubTensor::parent()
+{
+    return _parent;
+}
+
+void CLSubTensor::map(bool blocking)
+{
+    ICLTensor::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLSubTensor::unmap()
+{
+    ICLTensor::unmap(CLScheduler::get().queue());
+}
+
+uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
+    return static_cast<uint8_t *>(q.enqueueMapBuffer(cl_buffer(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->total_size()));
+}
+
+void CLSubTensor::do_unmap(cl::CommandQueue &q)
+{
+    ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
+    q.enqueueUnmapMemObject(cl_buffer(), buffer());
+}
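
For illustration only (commentary, not part of the patch): a rough sketch of how the new CLSubTensor could be used to view a slice of a parent CLTensor without copying. The shapes and the F32 data type are assumptions, and a valid CL context/queue (via CLScheduler) is assumed to be set up before allocation.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLSubTensor.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void subtensor_view_sketch()
    {
        // Hypothetical 16x16x8 F32 parent tensor.
        CLTensor parent;
        parent.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
        parent.allocator()->allocate();

        // Subtensor covering planes 2..5 of the parent; it shares the parent's cl::Buffer,
        // so cl_buffer() forwards to the parent while info() describes only the slice.
        CLSubTensor sub(&parent, TensorShape(16U, 16U, 4U), Coordinates(0, 0, 2));
    }
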
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
new file mode 100644
index 0000000..3df673c
--- /dev/null
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLBatchNormalizationLayer::CLBatchNormalizationLayer()
+    : _norm_kernel()
+{
+}
+
+void CLBatchNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon)
+{
+    _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
+}
+
+void CLBatchNormalizationLayer::run()
+{
+    CLScheduler::get().enqueue(_norm_kernel, true);
+}
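
For illustration only (commentary, not part of the patch): a bare-bones sketch of wiring up the new CLBatchNormalizationLayer. The shapes, the per-feature-map 1D layout of mean/var/beta/gamma and the epsilon value are assumptions; a CL context is assumed to be initialised, and tensors are allocated only after configure() so any padding requirements are honoured.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"

    #include <initializer_list>

    using namespace arm_compute;

    void batch_norm_sketch()
    {
        CLTensor input, output, mean, var, beta, gamma;
        input.allocator()->init(TensorInfo(TensorShape(28U, 28U, 16U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(28U, 28U, 16U), 1, DataType::F32));
        for(CLTensor *t : { &mean, &var, &beta, &gamma })
        {
            t->allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32)); // one value per feature map
        }

        CLBatchNormalizationLayer bn;
        bn.configure(&input, &output, &mean, &var, &beta, &gamma, 0.001f);

        for(CLTensor *t : { &input, &output, &mean, &var, &beta, &gamma })
        {
            t->allocator()->allocate();
        }

        bn.run();
    }
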
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index bb47bf9..f0bbc35 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -33,83 +33,155 @@
 
 using namespace arm_compute;
 
-CLConvolutionLayer::CLConvolutionLayer()
-    : _input_im2col_kernel(), _weights_reshape_kernel(), _input_interleave_kernel(), _weights_transposed_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
-      _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _is_first_run(false), _has_bias(false), _is_fc(false)
+CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
+    : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
 {
 }
 
-void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases, output);
+    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+    }
+
+    const bool _has_bias = (biases != nullptr);
+
+    _transpose1xW = transpose1xW;
+
+    if(transpose1xW)
+    {
+        // Create tensor to store the reshaped weights
+        const unsigned int mat_weights_cols = weights->info()->dimension(3);
+        const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+        TensorShape        shape_wr(mat_weights_cols, mat_weights_rows);
+        const DataType     dt = weights->info()->data_type();
+        TensorInfo         info_wr(shape_wr, 1, dt);
+
+        _weights_reshaped.allocator()->init(info_wr);
+        _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+        _weights_transposed_kernel.configure(&_weights_reshaped, output);
+        _weights_reshaped.allocator()->allocate();
+    }
+    else
+    {
+        _weights_reshape_kernel.configure(weights, biases, output);
+    }
+}
+
+void CLConvolutionLayerReshapeWeights::run()
+{
+    cl::CommandQueue q = CLScheduler::get().queue();
+    CLScheduler::get().enqueue(_weights_reshape_kernel);
+    if(_transpose1xW)
+    {
+        CLScheduler::get().enqueue(_weights_transposed_kernel);
+    }
+}
+
+CLConvolutionLayer::CLConvolutionLayer()
+    : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
+      _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+{
+}
+
+void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
 
     if(biases != nullptr)
     {
         ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+        ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
         ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
     }
 
-    _has_bias     = (biases != nullptr);
-    _is_first_run = true;
+    _has_bias             = (biases != nullptr);
+    _are_weights_reshaped = weights_info.are_reshaped();
 
     // Get parameters for conv_info
-    unsigned int stride_x, stride_y, pad_x, pad_y = 0;
+    unsigned int stride_x = 0;
+    unsigned int stride_y = 0;
+    unsigned int pad_x    = 0;
+    unsigned int pad_y    = 0;
     std::tie(stride_x, stride_y) = conv_info.stride();
     std::tie(pad_x, pad_y)       = conv_info.pad();
 
-    bool is_same_dimension = true;
-    // Make sure the input and weights have same low three dimensions
-    for(int i = 0; i < 3; i++)
-    {
-        is_same_dimension = (is_same_dimension) && (input->info()->dimension(i) == weights->info()->dimension(i));
-    }
-
-    // Run the fully connected path if is_same_dimension is true and conv_stride_x/conv_stride_y are 1, and conv_pad_x/conv_pad_y are 0 and skip col2im
-    _is_fc = (is_same_dimension) && ((stride_x & stride_y) == 1) && ((pad_x | pad_y) == 0);
-
     // Get convolved dimensions
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+
+    const unsigned int kernel_width = _are_weights_reshaped ? weights_info.kernel_size() : weights->info()->dimension(0);
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
                                                  stride_x, stride_y, pad_x, pad_y, conv_info.round());
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
 
+    // Check if it's a "fully connected" convolution
+    _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+
     // Create tensor to store the reshaped weights
-    const size_t      mat_weights_cols = weights->info()->dimension(3);
-    const size_t      mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
-    const TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
-    _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
-
-    // Create tensor to store transposed weights
-    TensorShape shape_wt(mat_weights_rows * 4, static_cast<size_t>(std::ceil(mat_weights_cols / 4.f)));
-    TensorInfo  info_wt(shape_wt, 1, weights->info()->data_type());
-    _weights_transposed.allocator()->init(info_wt);
-
+    size_t mat_weights_cols = weights->info()->dimension(3);
+    size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+    if(_are_weights_reshaped)
+    {
+        mat_weights_cols                         = output->info()->dimension(2);
+        const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
+        mat_weights_rows                         = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
+    }
+    else
+    {
+        if(_is_fully_connected_convolution)
+        {
+            // Create tensor to store the reshaped weights
+            TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+            TensorInfo  info_wr(shape_wr, 1, weights->info()->data_type());
+            _weights_reshaped.allocator()->init(info_wr);
+            _reshape_weights.configure(weights, biases, &_weights_reshaped, false);
+            weights = &_weights_reshaped;
+        }
+        else
+        {
+            // Create tensor to store transposed weights
+            TensorShape shape_wt(mat_weights_rows * 4, static_cast<size_t>(std::ceil(mat_weights_cols / 4.f)));
+            TensorInfo  info_wt(shape_wt, 1, weights->info()->data_type());
+            _weights_transposed.allocator()->init(info_wt);
+            _reshape_weights.configure(weights, biases, &_weights_transposed, true);
+            weights = &_weights_transposed;
+        }
+    }
     // Create tensor to store im2col reshaped inputs
     const size_t mat_input_cols = mat_weights_rows;
-    const size_t mat_input_rows = _is_fc ? (input->info()->dimension(3)) : (conv_w * conv_h);
+    const size_t mat_input_rows = conv_w * conv_h;
     TensorShape  shape_im2col   = input->info()->tensor_shape();
     shape_im2col.set(0, mat_input_cols);
     shape_im2col.set(1, mat_input_rows);
     shape_im2col.set(2, 1);
-    if(_is_fc)
-    {
-        shape_im2col.set(3, 1);
-    }
     _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
 
-    // Create tensor to prepare input tensor for GEMM
-    TensorShape shape_interleaved = shape_im2col;
-    shape_interleaved.set(0, shape_interleaved.x() * 4);
-    shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
-    _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
+    // Create tensor (interleave) to prepare input tensor for GEMM
+    if(!_is_fully_connected_convolution)
+    {
+        TensorShape shape_interleaved = shape_im2col;
+        shape_interleaved.set(0, shape_interleaved.x() * 4);
+        shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4.f));
+        _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
+    }
 
     // Create GEMM output tensor
     TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
@@ -119,48 +191,57 @@
 
     // Configure kernels
     _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
-    _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
-    _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
-    _weights_transposed_kernel.configure(&_weights_reshaped, &_weights_transposed);
-    if(_is_fc)
+    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+    if(_is_fully_connected_convolution)
     {
-        _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, output, 1.0f);
+        _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
     }
     else
     {
-        _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, &_gemm_output, 1.0f);
-        _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+        _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+        _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
     }
 
-    // Allocate intermediate tensors
-    _weights_reshaped.allocator()->allocate();
-    _weights_transposed.allocator()->allocate();
+    if(!_are_weights_reshaped)
+    {
+        if(!_is_fully_connected_convolution)
+        {
+            _weights_transposed.allocator()->allocate();
+        }
+        else
+        {
+            _weights_reshaped.allocator()->allocate();
+        }
+    }
+
     _input_im2col_reshaped.allocator()->allocate();
-    _input_interleaved_reshaped.allocator()->allocate();
+    if(!_is_fully_connected_convolution)
+    {
+        _input_interleaved_reshaped.allocator()->allocate();
+    }
     _gemm_output.allocator()->allocate();
 }
 
 void CLConvolutionLayer::run()
 {
     // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
+    if(!_are_weights_reshaped)
     {
-        _is_first_run = false;
-        CLScheduler::get().enqueue(_weights_reshape_kernel);
-        CLScheduler::get().enqueue(_weights_transposed_kernel);
+        _are_weights_reshaped = true;
+        _reshape_weights.run();
     }
 
     // Run input reshaping
     CLScheduler::get().enqueue(_input_im2col_kernel);
-    CLScheduler::get().enqueue(_input_interleave_kernel);
+    if(!_is_fully_connected_convolution)
+    {
+        CLScheduler::get().enqueue(_input_interleave_kernel);
+    }
 
     // Runs matrix multiply on reshaped matrices
     CLScheduler::get().enqueue(_mm_kernel);
 
     // Reshape output matrix
-
-    if(!_is_fc)
-    {
-        CLScheduler::get().enqueue(_output_col2im_kernel, false);
-    }
+    CLScheduler::get().enqueue(_output_col2im_kernel, false);
 }
diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp
new file mode 100644
index 0000000..d967d98
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthConcatenate.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDepthConcatenate::CLDepthConcatenate()
+    : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+{
+}
+
+void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+    _num_inputs = inputs_vector.size();
+
+    unsigned int depth_offset = 0;
+
+    _concat_kernels_vector  = arm_compute::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
+    _border_handlers_vector = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
+        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+        depth_offset += inputs_vector.at(i)->info()->dimension(2);
+    }
+}
+
+void CLDepthConcatenate::run()
+{
+    cl::CommandQueue q = CLScheduler::get().queue();
+
+    for(unsigned i = 0; i < _num_inputs; i++)
+    {
+        CLScheduler::get().enqueue(_border_handlers_vector[i], false);
+        CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+    }
+}
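
For illustration only (commentary, not part of the patch): a small sketch of concatenating two feature maps along the depth axis with the new CLDepthConcatenate. The shapes are made up; the output depth is simply the sum of the input depths, and a CL context is assumed to be initialised.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"

    #include <vector>

    using namespace arm_compute;

    void depth_concat_sketch()
    {
        CLTensor in0, in1, out;
        in0.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));
        in1.allocator()->init(TensorInfo(TensorShape(32U, 32U, 24U), 1, DataType::F32));
        out.allocator()->init(TensorInfo(TensorShape(32U, 32U, 32U), 1, DataType::F32));

        std::vector<ICLTensor *> inputs{ &in0, &in1 };
        CLDepthConcatenate concat;
        concat.configure(inputs, &out);

        in0.allocator()->allocate();
        in1.allocator()->allocate();
        out.allocator()->allocate();

        concat.run();
    }
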
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 08e18df..57d57d5 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -31,35 +31,99 @@
 
 using namespace arm_compute;
 
+CLFullyConnectedLayerReshapeWeights::CLFullyConnectedLayerReshapeWeights()
+    : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+{
+}
+
+void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
+    ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
+
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
+
+    _transpose_weights   = transpose_weights;
+    _is_batched_fc_layer = is_batched_fc_layer;
+
+    // Check if we need to transpose the weights
+    if(_transpose_weights)
+    {
+        if(_is_batched_fc_layer)
+        {
+            // Initialize the output tensor for transpose
+            TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
+            _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
+            _transpose_kernel.configure(input, &_transpose_output);
+
+            // Configure transpose 1xW kernel
+            _transpose1xW_kernel.configure(&_transpose_output, output);
+
+            // Allocate temporary tensor used for transposing the weights
+            _transpose_output.allocator()->allocate();
+        }
+        else
+        {
+            _transpose_kernel.configure(input, output);
+        }
+    }
+    else
+    {
+        if(_is_batched_fc_layer)
+        {
+            // Configure transpose 1xW kernel
+            _transpose1xW_kernel.configure(input, output);
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
+        }
+    }
+}
+
+void CLFullyConnectedLayerReshapeWeights::run()
+{
+    if(_transpose_weights)
+    {
+        CLScheduler::get().enqueue(_transpose_kernel, _is_batched_fc_layer);
+    }
+    if(_is_batched_fc_layer)
+    {
+        CLScheduler::get().enqueue(_transpose1xW_kernel);
+    }
+}
+
 CLFullyConnectedLayer::CLFullyConnectedLayer()
-    : _im2col_kernel(), _transpose_kernel(), _transpose1xW_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _transpose_output(),
-      _transpose1xW_output(), _is_first_run(true), _transpose_weights(true), _fc_after_conv(true), _batched_fc_layer(false), _accumulate_biases(false)
+    : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
+      _are_weights_reshaped(true), _is_fc_after_conv(true), _is_batched_fc_layer(false), _accumulate_biases(false)
 {
 }
 
 void CLFullyConnectedLayer::configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
+
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
 
     // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
 
     // Initialize output tensor for im2col
     TensorShape shape_im2col;
-    shape_im2col.set(0, weights->info()->dimension(1));
+    shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
     shape_im2col.set(1, input->info()->dimension(3));
     shape_im2col.set(2, input->info()->dimension(4));
     shape_im2col.set(3, input->info()->dimension(5));
-    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
 
     // Initialize output tensor for interleave 4x4
     TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
     shape_interleaved.set(0, shape_interleaved.x() * 4);
     shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
-    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
-
-    // Initialize output tensor for transpose 1xW
-    TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
-    _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
 
     // Configure im2col kernel
     _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
@@ -67,55 +131,49 @@
     // Configure interleave4x4 kernel
     _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
 
-    // Configure transpose 1xW kernel
-    _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
-
     // Configure matrix multiply kernel
-    _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+    _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
 
     // Allocate the tensors once all the configure methods have been called
     _im2col_output.allocator()->allocate();
     _interleave4x4_output.allocator()->allocate();
-    _transpose1xW_output.allocator()->allocate();
 }
 
 void CLFullyConnectedLayer::configure_fc_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
 {
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
+
     // Initialize output tensor for interleave 4x4
     TensorShape shape_interleaved = input->info()->tensor_shape();
     shape_interleaved.set(0, shape_interleaved.x() * 4);
     shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
-    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
-
-    // Initialize output tensor for transpose 1xW
-    TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
-    _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
 
     // Configure interleave4x4 kernel
     _interleave4x4_kernel.configure(input, &_interleave4x4_output);
 
-    // Configure transpose 1xW kernel
-    _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
-
     // Configure matrix multiply kernel
-    _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+    _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
 
     // Allocate the tensors once all the configure methods have been called
     _interleave4x4_output.allocator()->allocate();
-    _transpose1xW_output.allocator()->allocate();
 }
 
 void CLFullyConnectedLayer::configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
 {
     ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
 
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
+
     // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
 
     // Initialize output tensor for im2col
     TensorShape shape_im2col;
-    shape_im2col.set(0, weights->info()->dimension(1));
+    shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
     shape_im2col.set(1, 1);
-    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
 
     // Configure im2col kernel
     _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
@@ -135,20 +193,20 @@
     _mm_kernel.configure(input, weights, output, 1.0f);
 }
 
-void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights)
+void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
 
-    const ICLTensor *weights_to_use = weights;
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
 
-    _is_first_run      = true;
-    _transpose_weights = transpose_weights;
-    _fc_after_conv     = true;
-    _batched_fc_layer  = false;
-    _accumulate_biases = false;
+    _are_weights_reshaped = are_weights_reshaped;
+    _is_fc_after_conv     = true;
+    _is_batched_fc_layer  = false;
+    _accumulate_biases    = false;
 
     if(biases != nullptr)
     {
@@ -160,17 +218,6 @@
         _accumulate_biases_kernel.configure(output, biases);
     }
 
-    // Check if we need to transpose the weights
-    if(_transpose_weights)
-    {
-        // Initialize the output tensor for transpose
-        TensorShape shape_transposed(weights->info()->dimension(1), weights->info()->dimension(0));
-        _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, weights->info()->data_type()));
-        _transpose_kernel.configure(weights, &_transpose_output);
-
-        weights_to_use = &_transpose_output;
-    }
-
     // With the Fully Connected layer we can have 4 different cases:
     //  1) Convolution layer -> Fully Connected layer without batches
     //  2) Fully Connected layer -> Fully Connected layer without batches
@@ -178,15 +225,54 @@
     //  4) Fully Connected layer -> Fully Connected layer with batches
 
     // Check if we have a fully connected layer with batches
-    _batched_fc_layer = (output->info()->dimension(1) > 1);
+    _is_batched_fc_layer = (output->info()->dimension(1) > 1);
 
-    if(_batched_fc_layer)
+    const ICLTensor *weights_to_use = weights;
+
+    if(!are_weights_reshaped)
     {
-        _fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
-                                                                               input->info()->tensor_shape().cend(),
-                                                                               output->info()->tensor_shape().cbegin() + 1));
+        if((transpose_weights || _is_batched_fc_layer))
+        {
+            weights_to_use = &_reshape_weights_output;
 
-        if(_fc_after_conv)
+            if(transpose_weights)
+            {
+                if(_is_batched_fc_layer)
+                {
+                    const float transpose_width = 16.0f / input->info()->element_size();
+                    TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
+                    TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
+                    _reshape_weights_output.allocator()->init(info_wt);
+                }
+                else
+                {
+                    TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
+                    TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
+                    _reshape_weights_output.allocator()->init(info_wt);
+                }
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
+
+                const float transpose_width = 16.0f / input->info()->element_size();
+                TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
+                TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
+                _reshape_weights_output.allocator()->init(info_wt);
+            }
+
+            // Reshape the weights
+            _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+        }
+    }
+
+    if(_is_batched_fc_layer)
+    {
+        _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+                                                                                  input->info()->tensor_shape().cend(),
+                                                                                  output->info()->tensor_shape().cbegin() + 1));
+
+        if(_is_fc_after_conv)
         {
             // Fully Connected layer after a Convolution Layer with batches
             configure_conv_fc_wb(input, weights_to_use, output);
@@ -199,9 +285,10 @@
     }
     else
     {
-        _fc_after_conv = (weights_to_use->info()->dimension(1) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+        // In the case of a non-batched fully connected layer, the weights are not reshaped using transpose1xW
+        _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
 
-        if(_fc_after_conv)
+        if(_is_fc_after_conv)
         {
             // Fully Connected layer after a Convolution Layer without batches
             configure_conv_fc_nb(input, weights_to_use, output);
@@ -213,39 +300,34 @@
         }
     }
 
-    // Allocate the transpose tensor if the transpose_weights flag is true and once all the configure methods have been called
-    if(_transpose_weights)
+    // Allocate the reshaped weights tensor if the are_weights_reshaped flag is false, once all the configure methods have been called
+    if(!are_weights_reshaped)
     {
-        _transpose_output.allocator()->allocate();
+        if(transpose_weights || _is_batched_fc_layer)
+        {
+            // Allocate the tensor for the reshaped weights
+            _reshape_weights_output.allocator()->allocate();
+        }
     }
 }
 
 void CLFullyConnectedLayer::run()
 {
-    // The reshape of the weights happens only once
-    if(_is_first_run)
+    // Reshape of the weights (happens only once)
+    if(!_are_weights_reshaped)
     {
-        _is_first_run = false;
-
-        if(_transpose_weights)
-        {
-            CLScheduler::get().enqueue(_transpose_kernel);
-        }
-
-        if(_batched_fc_layer)
-        {
-            CLScheduler::get().enqueue(_transpose1xW_kernel);
-        }
+        _are_weights_reshaped = true;
+        _reshape_weights_kernel.run();
     }
 
     // Linearize input if it comes from a convolutional layer
-    if(_fc_after_conv)
+    if(_is_fc_after_conv)
     {
         CLScheduler::get().enqueue(_im2col_kernel, false);
     }
 
     // Interleave input
-    if(_batched_fc_layer)
+    if(_is_batched_fc_layer)
     {
         CLScheduler::get().enqueue(_interleave4x4_kernel, false);
     }
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
new file mode 100644
index 0000000..b1b5a03
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOGDescriptor::CLHOGDescriptor()
+    : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+{
+}
+
+void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    ARM_COMPUTE_ERROR_ON(nullptr == hog);
+
+    const HOGInfo *hog_info = hog->info();
+    const size_t   width    = input->info()->dimension(Window::DimX);
+    const size_t   height   = input->info()->dimension(Window::DimY);
+    const size_t   num_bins = hog_info->num_bins();
+
+    Size2D cell_size = hog_info->cell_size();
+
+    // Calculate number of cells along the x and y directions for the hog_space
+    const size_t num_cells_x = width / cell_size.width;
+    const size_t num_cells_y = height / cell_size.height;
+
+    // TensorShape of the input image
+    const TensorShape &shape_img = input->info()->tensor_shape();
+
+    // TensorShape of the hog space
+    TensorShape shape_hog_space = input->info()->tensor_shape();
+    shape_hog_space.set(Window::DimX, num_cells_x);
+    shape_hog_space.set(Window::DimY, num_cells_y);
+
+    // Initialize tensors for magnitude, phase and HOG space
+    TensorInfo info_mag(shape_img, Format::S16);
+    _mag.allocator()->init(info_mag);
+
+    TensorInfo info_phase(shape_img, Format::U8);
+    _phase.allocator()->init(info_phase);
+
+    TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
+    _hog_space.allocator()->init(info_space);
+
+    // Initialise gradient kernel
+    _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+
+    // Initialise orientation binning kernel
+    _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
+
+    // Initialize HOG norm kernel
+    _block_norm.configure(&_hog_space, output, hog->info());
+
+    // Allocate intermediate tensors
+    _mag.allocator()->allocate();
+    _phase.allocator()->allocate();
+    _hog_space.allocator()->allocate();
+}
+
+void CLHOGDescriptor::run()
+{
+    // Run gradient
+    _gradient.run();
+
+    // Run orientation binning
+    CLScheduler::get().enqueue(_orient_bin, false);
+
+    // Run block normalization
+    CLScheduler::get().enqueue(_block_norm);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp
new file mode 100644
index 0000000..8eb5e42
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGDetector.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
+
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <algorithm>
+
+using namespace arm_compute;
+
+CLHOGDetector::CLHOGDetector()
+    : _hog_detector_kernel(), _detection_windows(nullptr), _num_detection_windows()
+{
+}
+
+void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
+{
+    _detection_windows = detection_windows;
+
+    // Allocate buffer for storing the number of detected objects
+    _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
+
+    // Configure HOGDetectorKernel
+    _hog_detector_kernel.configure(input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class);
+}
+
+void CLHOGDetector::run()
+{
+    cl::CommandQueue q = CLScheduler::get().queue();
+
+    // Reset number of detections
+    const unsigned int init_num_detection_windows = _detection_windows->num_values();
+    q.enqueueWriteBuffer(_num_detection_windows, CL_FALSE, 0, sizeof(unsigned int), &init_num_detection_windows);
+
+    // Run CLHOGDetectorKernel
+    CLScheduler::get().enqueue(_hog_detector_kernel);
+
+    // Read number of detections
+    unsigned int num_detection_windows = 0;
+    q.enqueueReadBuffer(_num_detection_windows, CL_TRUE, 0, sizeof(unsigned int), &num_detection_windows);
+
+    // Update the number of values stored in _detection_windows
+    _detection_windows->resize(static_cast<size_t>(num_detection_windows));
+
+    q.flush();
+}
\ No newline at end of file
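
CLHOGDetector::run above keeps the number of detections in a single-element cl::Buffer: it resets the counter with a non-blocking write, lets the kernel increment it on the device, then reads it back with a blocking read before resizing the detection window array. Below is a standalone sketch of that counter round-trip using only the OpenCL C++ wrapper; `ctx` and `q` stand for an already created cl::Context and cl::CommandQueue and are hypothetical names, not from the diff.

    // One-element counter the device kernels can increment atomically
    cl::Buffer counter(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));

    // Non-blocking reset: start_value must stay alive until the write has completed,
    // which is guaranteed here by the blocking read further down.
    unsigned int start_value = 0;
    q.enqueueWriteBuffer(counter, CL_FALSE, 0, sizeof(unsigned int), &start_value);

    // ... enqueue the kernels that bump the counter ...

    // Blocking read: returns only once the queued work has finished and the value is visible on the host
    unsigned int final_value = 0;
    q.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(unsigned int), &final_value);
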
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
new file mode 100644
index 0000000..2387474
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOGGradient::CLHOGGradient()
+    : _derivative(), _mag_phase(), _gx(), _gy()
+{
+}
+
+void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8);
+
+    const TensorShape &shape_img = input->info()->tensor_shape();
+
+    // Allocate image memory
+    TensorInfo info(shape_img, Format::S16);
+    _gx.allocator()->init(info);
+    _gy.allocator()->init(info);
+
+    // Initialise derivative kernel
+    _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
+
+    // Initialise magnitude/phase kernel
+    if(PhaseType::UNSIGNED == phase_type)
+    {
+        _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED);
+    }
+    else
+    {
+        _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED);
+    }
+
+    // Allocate intermediate tensors
+    _gx.allocator()->allocate();
+    _gy.allocator()->allocate();
+}
+
+void CLHOGGradient::run()
+{
+    // Run derivative
+    _derivative.run();
+
+    // Run magnitude/phase kernel
+    CLScheduler::get().enqueue(_mag_phase);
+}
\ No newline at end of file
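
CLHOGGradient writes the gradient magnitude into an S16 tensor and the phase into a U8 tensor with the same shape as the input, exactly as CLHOGDescriptor does internally above. A minimal usage sketch under that assumption; `src` stands for an already initialised and allocated U8 CLTensor and is a hypothetical name, not part of the diff.

    CLTensor mag;
    CLTensor phase;

    // Magnitude is S16 and phase is U8, both with the input's shape (see CLHOGDescriptor above)
    mag.allocator()->init(TensorInfo(src.info()->tensor_shape(), Format::S16));
    phase.allocator()->init(TensorInfo(src.info()->tensor_shape(), Format::U8));

    CLHOGGradient gradient;
    gradient.configure(&src, &mag, &phase, PhaseType::UNSIGNED, BorderMode::CONSTANT, 0);

    // Allocate after configure() so any padding requested during configuration is taken into account
    mag.allocator()->allocate();
    phase.allocator()->allocate();

    gradient.run();
    CLScheduler::get().sync(); // wait for the enqueued kernels to finish
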
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
new file mode 100644
index 0000000..b8f2224
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+using namespace arm_compute;
+
+CLHOGMultiDetection::CLHOGMultiDetection()
+    : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(),
+      _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0)
+{
+}
+
+void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
+                                    uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog);
+    ARM_COMPUTE_ERROR_ON(nullptr == detection_windows);
+    ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models());
+
+    const size_t       width      = input->info()->dimension(Window::DimX);
+    const size_t       height     = input->info()->dimension(Window::DimY);
+    const TensorShape &shape_img  = input->info()->tensor_shape();
+    const size_t       num_models = multi_hog->num_models();
+    PhaseType          phase_type = multi_hog->model(0)->info()->phase_type();
+
+    size_t prev_num_bins     = multi_hog->model(0)->info()->num_bins();
+    Size2D prev_cell_size    = multi_hog->model(0)->info()->cell_size();
+    Size2D prev_block_size   = multi_hog->model(0)->info()->block_size();
+    Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride();
+
+    /* Check if the CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object
+     *
+     * 1) CLHOGOrientationBinningKernel is skipped if the cell size and the number of bins do not change.
+     *        Since "multi_hog" is sorted, it is enough to compare the HOG descriptors at level "i" and level "i-1"
+     * 2) CLHOGBlockNormalizationKernel is skipped if, in addition, the block size and the block stride do not change.
+     *        Since "multi_hog" is sorted, it is enough to compare the HOG descriptors at level "i" and level "i-1"
+     *
+     * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel
+     *       with "input_orient_bin", "input_hog_detect" and "input_block_norm"
+     */
+    std::vector<size_t> input_orient_bin;
+    std::vector<size_t> input_hog_detect;
+    std::vector<std::pair<size_t, size_t>> input_block_norm;
+
+    input_orient_bin.push_back(0);
+    input_hog_detect.push_back(0);
+    input_block_norm.emplace_back(0, 0);
+
+    for(size_t i = 1; i < num_models; ++i)
+    {
+        size_t cur_num_bins     = multi_hog->model(i)->info()->num_bins();
+        Size2D cur_cell_size    = multi_hog->model(i)->info()->cell_size();
+        Size2D cur_block_size   = multi_hog->model(i)->info()->block_size();
+        Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride();
+
+        if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height))
+        {
+            prev_num_bins     = cur_num_bins;
+            prev_cell_size    = cur_cell_size;
+            prev_block_size   = cur_block_size;
+            prev_block_stride = cur_block_stride;
+
+            // Compute orientation binning and block normalization kernels. Update input to process
+            input_orient_bin.push_back(i);
+            input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+        }
+        else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width)
+                || (cur_block_stride.height != prev_block_stride.height))
+        {
+            prev_block_size   = cur_block_size;
+            prev_block_stride = cur_block_stride;
+
+            // Compute block normalization kernel. Update input to process
+            input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+        }
+
+        // Update input to process for hog detector kernel
+        input_hog_detect.push_back(input_block_norm.size() - 1);
+    }
+
+    _detection_windows      = detection_windows;
+    _non_maxima_suppression = non_maxima_suppression;
+    _num_orient_bin_kernel  = input_orient_bin.size(); // Number of CLHOGOrientationBinningKernel kernels to compute
+    _num_block_norm_kernel  = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
+    _num_hog_detect_kernel  = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
+
+    _orient_bin_kernel = arm_compute::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
+    _block_norm_kernel = arm_compute::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
+    _hog_detect_kernel = arm_compute::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
+    _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
+    _hog_space         = arm_compute::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
+    _hog_norm_space    = arm_compute::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
+
+    // Allocate tensors for magnitude and phase
+    TensorInfo info_mag(shape_img, Format::S16);
+    _mag.allocator()->init(info_mag);
+
+    TensorInfo info_phase(shape_img, Format::U8);
+    _phase.allocator()->init(info_phase);
+
+    // Initialise gradient kernel
+    _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
+
+    // Configure CLTensor for the HOG space and orientation binning kernel
+    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+    {
+        const size_t idx_multi_hog = input_orient_bin[i];
+
+        // Get the corresponding cell size and number of bins
+        const Size2D &cell     = multi_hog->model(idx_multi_hog)->info()->cell_size();
+        const size_t  num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins();
+
+        // Calculate number of cells along the x and y directions for the hog_space
+        const size_t num_cells_x = width / cell.width;
+        const size_t num_cells_y = height / cell.height;
+
+        // TensorShape of hog space
+        TensorShape shape_hog_space = input->info()->tensor_shape();
+        shape_hog_space.set(Window::DimX, num_cells_x);
+        shape_hog_space.set(Window::DimY, num_cells_y);
+
+        // Allocate HOG space
+        TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
+        _hog_space[i].allocator()->init(info_space);
+
+        // Initialise orientation binning kernel
+        _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+    }
+
+    // Configure CLTensor for the normalized HOG space and block normalization kernel
+    for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+    {
+        const size_t idx_multi_hog  = input_block_norm[i].first;
+        const size_t idx_orient_bin = input_block_norm[i].second;
+
+        // Allocate normalized HOG space
+        TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
+        _hog_norm_space[i].allocator()->init(tensor_info);
+
+        // Initialize block normalization kernel
+        _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+    }
+
+    detection_window_strides->map(CLScheduler::get().queue(), true);
+
+    // Configure HOG detector kernel
+    for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+    {
+        const size_t idx_block_norm = input_hog_detect[i];
+
+        _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+    }
+
+    detection_window_strides->unmap(CLScheduler::get().queue());
+
+    // Configure non maxima suppression kernel
+    _non_maxima_kernel->configure(_detection_windows, min_distance);
+
+    // Allocate intermediate tensors
+    _mag.allocator()->allocate();
+    _phase.allocator()->allocate();
+
+    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+    {
+        _hog_space[i].allocator()->allocate();
+    }
+
+    for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+    {
+        _hog_norm_space[i].allocator()->allocate();
+    }
+}
+
+void CLHOGMultiDetection::run()
+{
+    ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
+
+    // Reset detection window
+    _detection_windows->clear();
+
+    // Run gradient
+    _gradient_kernel.run();
+
+    // Run orientation binning kernel
+    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+    {
+        CLScheduler::get().enqueue(*(_orient_bin_kernel.get() + i), false);
+    }
+
+    // Run block normalization kernel
+    for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+    {
+        CLScheduler::get().enqueue(*(_block_norm_kernel.get() + i), false);
+    }
+
+    // Run HOG detector kernel
+    for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+    {
+        _hog_detect_kernel[i].run();
+    }
+
+    // Run non-maxima suppression kernel if enabled
+    if(_non_maxima_suppression)
+    {
+        // Map detection windows array before computing non maxima suppression
+        _detection_windows->map(CLScheduler::get().queue(), true);
+        _non_maxima_kernel->run(_non_maxima_kernel->window());
+        _detection_windows->unmap(CLScheduler::get().queue());
+    }
+}
\ No newline at end of file
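
The loop over the HOG models above adds a new orientation binning pass only when the cell size or the number of bins changes, and a new block normalization pass when, in addition, the block size or the block stride changes. A standalone sketch of that decision logic; the ModelParams struct is hypothetical and only mirrors the fields compared in the diff.

    #include <cstddef>

    struct ModelParams
    {
        std::size_t num_bins;
        std::size_t cell_w, cell_h;
        std::size_t block_w, block_h;
        std::size_t stride_w, stride_h;
    };

    // True if the current model needs its own orientation binning pass
    bool needs_orient_bin(const ModelParams &prev, const ModelParams &cur)
    {
        return (cur.num_bins != prev.num_bins) || (cur.cell_w != prev.cell_w) || (cur.cell_h != prev.cell_h);
    }

    // True if the current model needs its own block normalization pass
    bool needs_block_norm(const ModelParams &prev, const ModelParams &cur)
    {
        return needs_orient_bin(prev, cur)
               || (cur.block_w != prev.block_w) || (cur.block_h != prev.block_h)
               || (cur.stride_w != prev.stride_w) || (cur.stride_h != prev.stride_h);
    }
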
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 6501da3..2db277f 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -34,8 +34,8 @@
 #include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
 #include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
 #include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
-#include "arm_compute/runtime/CPP/CPPScheduler.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
+#include "arm_compute/runtime/Scheduler.h"
 
 #include <cmath>
 #include <utility>
@@ -148,7 +148,7 @@
 
     // Run corner candidate kernel
     _nonmax.map(true);
-    CPPScheduler::get().multithread(&_candidates);
+    Scheduler::get().schedule(&_candidates, Window::DimY);
     _nonmax.unmap();
 
     _corners->map(CLScheduler::get().queue(), true);
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
new file mode 100644
index 0000000..263fb51
--- /dev/null
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLLocallyConnectedLayer::CLLocallyConnectedLayer()
+    : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+{
+}
+
+void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
+    }
+
+    bool _has_bias = (biases != nullptr);
+    _is_first_run  = true;
+
+    // Get parameters for conv_info
+    unsigned int stride_x = 0;
+    unsigned int stride_y = 0;
+    unsigned int pad_x    = 0;
+    unsigned int pad_y    = 0;
+    std::tie(stride_x, stride_y) = conv_info.stride();
+    std::tie(pad_x, pad_y)       = conv_info.pad();
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+                                                 stride_x, stride_y, pad_x, pad_y, conv_info.round());
+
+    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+    // Create tensor to store the reshaped weights
+    const size_t mat_weights_cols = weights->info()->dimension(3);
+    const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+    const size_t mat_weights_num  = weights->info()->dimension(4);
+
+    const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
+
+    _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
+
+    // Create tensor to store im2col reshaped inputs
+    const size_t mat_input_cols = mat_weights_rows;
+    const size_t mat_input_rows = conv_w * conv_h;
+    TensorShape  shape_im2col   = input->info()->tensor_shape();
+    shape_im2col.set(0, mat_input_cols);
+    shape_im2col.set(1, mat_input_rows);
+    shape_im2col.set(2, 1);
+
+    _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+
+    // Create locally connected layer output tensor
+    TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+    shape_gemm.set(0, mat_weights_cols);
+    shape_gemm.set(1, mat_input_rows);
+    _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+
+    // Configure kernels
+    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+    _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+    _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
+    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+    // Allocate intermediate tensors
+    _weights_reshaped.allocator()->allocate();
+    _input_im2col_reshaped.allocator()->allocate();
+    _gemm_output.allocator()->allocate();
+}
+
+void CLLocallyConnectedLayer::run()
+{
+    // Run weights reshaping (Runs once for every configure)
+    if(_is_first_run)
+    {
+        _is_first_run = false;
+        CLScheduler::get().enqueue(_weights_reshape_kernel);
+    }
+
+    // Run input reshaping
+    CLScheduler::get().enqueue(_input_im2col_kernel);
+
+    // Runs vector matrix multiply on reshaped matrices
+    CLScheduler::get().enqueue(_mm_kernel);
+
+    // Reshape output matrix
+    CLScheduler::get().enqueue(_output_col2im_kernel, false);
+}
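
The shape bookkeeping above follows the usual im2col plus matrix multiply pattern, with the twist that a locally connected layer keeps one independent weight matrix per output position, hence the fifth weights dimension checked against conv_w * conv_h. A plain arithmetic sketch with made-up sizes, illustrative only:

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        // Hypothetical layer: 16x16x3 input, 5x5 kernels, 8 output maps, stride 1, no padding
        const std::size_t in_w = 16, in_h = 16, in_c = 3;
        const std::size_t k = 5, ofm = 8;
        const bool        has_bias = true;

        const std::size_t conv_w = in_w - k + 1; // 12
        const std::size_t conv_h = in_h - k + 1; // 12

        const std::size_t mat_weights_cols = ofm;                               // one column per output map
        const std::size_t mat_weights_rows = k * k * in_c + (has_bias ? 1 : 0); // one row per patch value (+1 for the bias)
        const std::size_t mat_weights_num  = conv_w * conv_h;                   // one weight matrix per output position

        const std::size_t mat_input_cols = mat_weights_rows; // length of each im2col row
        const std::size_t mat_input_rows = conv_w * conv_h;  // one im2col row per output position

        std::printf("weights reshaped: %zu x %zu x %zu, im2col: %zu x %zu\n",
                    mat_weights_cols, mat_weights_rows, mat_weights_num, mat_input_cols, mat_input_rows);
        return 0;
    }
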
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 210dbb7..8869330 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -35,14 +35,6 @@
 
 using namespace arm_compute;
 
-#ifdef NO_MULTI_THREADING
-namespace
-{
-void delete_threads(Thread *t)
-{
-}
-}
-#else  /* NO_MULTI_THREADING */
 class arm_compute::Thread
 {
 public:
@@ -162,7 +154,6 @@
     delete[] t;
 }
 } // namespace
-#endif /* NO_MULTI_THREADING */
 
 CPPScheduler &CPPScheduler::get()
 {
@@ -170,49 +161,39 @@
     return scheduler;
 }
 
+unsigned int CPPScheduler::num_threads() const
+{
+    return _num_threads;
+}
+
 CPPScheduler::CPPScheduler()
-    : _num_threads(0), _threads(nullptr, delete_threads)
+    : _num_threads(std::thread::hardware_concurrency()),
+      _threads(std::unique_ptr<Thread[], void(*)(Thread *)>(new Thread[std::thread::hardware_concurrency() - 1], delete_threads))
 {
-    force_number_of_threads(0);
 }
 
-void CPPScheduler::force_number_of_threads(int num_threads)
+void CPPScheduler::set_num_threads(unsigned int num_threads)
 {
-#ifdef NO_MULTI_THREADING
-    ARM_COMPUTE_ERROR_ON(num_threads > 1);
-    _num_threads = 1;
-#else  /* NO_MULTI_THREADING */
-    _num_threads = num_threads > 0 ? num_threads : std::thread::hardware_concurrency();
-    ARM_COMPUTE_ERROR_ON(_num_threads < 1);
-
-    if(_num_threads > 1)
-    {
-        _threads = std::unique_ptr<Thread[], void (*)(Thread *)>(new Thread[_num_threads - 1], delete_threads);
-    }
-    else
-    {
-        _threads = nullptr;
-    }
-#endif /* NO_MULTI_THREADING */
+    const unsigned int num_cores = std::thread::hardware_concurrency();
+    _num_threads                 = num_threads == 0 ? num_cores : num_threads;
 }
 
-void CPPScheduler::multithread(ICPPKernel *kernel, const size_t split_dimension)
+void CPPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
 
     /** [Scheduler example] */
-    const Window &max_window     = kernel->window();
-    const int     num_iterations = max_window.num_iterations(split_dimension);
-    int           num_threads    = std::min(num_iterations, _num_threads);
+    const Window      &max_window     = kernel->window();
+    const unsigned int num_iterations = max_window.num_iterations(split_dimension);
+    const unsigned int num_threads    = std::min(num_iterations, _num_threads);
 
     if(!kernel->is_parallelisable() || 1 == num_threads)
     {
         kernel->run(max_window);
     }
-#ifndef NO_MULTI_THREADING
     else
     {
-        for(int t = 0; t < num_threads; ++t)
+        for(unsigned int t = 0; t < num_threads; ++t)
         {
             Window win = max_window.split_window(split_dimension, t, num_threads);
             win.set_thread_id(t);
@@ -230,7 +211,7 @@
 
         try
         {
-            for(int t = 1; t < num_threads; ++t)
+            for(unsigned int t = 1; t < num_threads; ++t)
             {
                 _threads[t - 1].wait();
             }
@@ -240,6 +221,5 @@
             std::cout << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
         }
     }
-#endif /* NO_MULTI_THREADING */
     /** [Scheduler example] */
 }
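
With these changes the pool size is no longer forced at construction: the scheduler defaults to std::thread::hardware_concurrency() worker threads and set_num_threads(0) restores that default. A minimal sketch of how a caller might pin the pool before scheduling a kernel; `some_kernel` is a hypothetical ICPPKernel pointer, not from the diff.

    // Use four worker threads (passing 0 would mean "one per hardware thread")
    CPPScheduler::get().set_num_threads(4);

    // Split the kernel's window along the Y dimension across the pool, as the NEON functions above do
    CPPScheduler::get().schedule(some_kernel, Window::DimY);
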
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
new file mode 100644
index 0000000..f086813
--- /dev/null
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/SingleThreadScheduler.h"
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+
+using namespace arm_compute;
+
+SingleThreadScheduler &SingleThreadScheduler::get()
+{
+    static SingleThreadScheduler scheduler;
+    return scheduler;
+}
+
+void SingleThreadScheduler::set_num_threads(unsigned int num_threads)
+{
+    ARM_COMPUTE_UNUSED(num_threads);
+}
+
+void SingleThreadScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
+{
+    ARM_COMPUTE_UNUSED(split_dimension);
+    kernel->run(kernel->window());
+}
+
+unsigned int SingleThreadScheduler::num_threads() const
+{
+    return 1;
+}
diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp
index c99d59b..6f0da85 100644
--- a/src/runtime/NEON/INESimpleFunction.cpp
+++ b/src/runtime/NEON/INESimpleFunction.cpp
@@ -35,5 +35,5 @@
 void INESimpleFunction::run()
 {
     _border_handler.run(_border_handler.window());
-    NEScheduler::get().multithread(_kernel.get());
+    NEScheduler::get().schedule(_kernel.get(), Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
new file mode 100644
index 0000000..a24429c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEBatchNormalizationLayer::NEBatchNormalizationLayer()
+    : _norm_kernel()
+{
+}
+
+void NEBatchNormalizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+{
+    // Configure kernel
+    _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
+}
+
+void NEBatchNormalizationLayer::run()
+{
+    NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+}
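
NEBatchNormalizationLayer is a thin wrapper around a single kernel, so its whole surface is configure() followed by run(). A minimal usage sketch, assuming `src`, `dst`, `mean`, `var`, `beta` and `gamma` are already initialised and allocated NEON Tensors with compatible shapes; the names and the epsilon value are hypothetical, not from the diff.

    NEBatchNormalizationLayer bn;

    // epsilon guards against dividing by a zero variance
    bn.configure(&src, &dst, &mean, &var, &beta, &gamma, 0.001f);

    // The kernel is multithreaded along the Y dimension by NEScheduler (see run() above)
    bn.run();
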
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 2d7ad86..26f31f5 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -153,10 +153,10 @@
     _border_mag_gradient.run(_border_mag_gradient.window());
 
     // Run gradient
-    NEScheduler::get().multithread(_gradient.get());
+    NEScheduler::get().schedule(_gradient.get(), Window::DimY);
 
     // Run non-maxima suppression
-    NEScheduler::get().multithread(&_non_max_suppr);
+    NEScheduler::get().schedule(&_non_max_suppr, Window::DimY);
 
     ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
     memset(_output->buffer(), 0, _output->info()->total_size());
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index c2b3d7a..3f39ae2 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -98,12 +98,12 @@
 
     if(_is_separable)
     {
-        NEScheduler::get().multithread(&_kernel_hor);
-        NEScheduler::get().multithread(&_kernel_vert);
+        NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
+        NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
     }
     else
     {
-        NEScheduler::get().multithread(&_kernel);
+        NEScheduler::get().schedule(&_kernel, Window::DimY);
     }
 }
 
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index aae4a67..bd688cf 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -33,33 +33,93 @@
 
 using namespace arm_compute;
 
-NEConvolutionLayer::NEConvolutionLayer()
-    : _input_im2col_kernel(), _input_interleave_kernel(), _weights_reshape_kernel(), _weights_transposed_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
-      _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _is_first_run(false), _has_bias(false)
+NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights()
+    : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
 {
 }
 
-void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
 
     if(biases != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
         ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
         ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
     }
 
-    _has_bias     = (biases != nullptr);
-    _is_first_run = true;
+    // Check if biases are present; if so, they will be embedded into the weights matrix
+    const bool _has_bias = (biases != nullptr);
 
-    // Get parameters for conv_info
+    _transpose1xW = transpose1xW;
+
+    if(transpose1xW)
+    {
+        // Create tensor to store the reshaped weights
+        const unsigned int mat_weights_cols = weights->info()->dimension(3);
+        const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+        TensorShape        shape_wr(mat_weights_cols, mat_weights_rows);
+        TensorInfo         info_wr(shape_wr, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+
+        _weights_reshaped.allocator()->init(info_wr);
+        _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+        _weights_transposed_kernel.configure(&_weights_reshaped, output);
+        _weights_reshaped.allocator()->allocate();
+    }
+    else
+    {
+        _weights_reshape_kernel.configure(weights, biases, output);
+    }
+}
+
+void NEConvolutionLayerReshapeWeights::run()
+{
+    NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+    if(_transpose1xW)
+    {
+        NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY);
+    }
+}
+
+NEConvolutionLayer::NEConvolutionLayer()
+    : _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
+      _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+{
+}
+
+void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, output);
+    ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+        ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+    }
+
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
+
+    _has_bias             = (biases != nullptr);
+    _are_weights_reshaped = weights_info.are_reshaped();
+
+    // Get parameters from conv_info
     unsigned int stride_x = 0;
     unsigned int stride_y = 0;
     unsigned int pad_x    = 0;
@@ -70,21 +130,46 @@
     // Get convolved dimensions
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+
+    const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size() : weights->info()->dimension(0);
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
                                                  stride_x, stride_y, pad_x, pad_y, conv_info.round());
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
 
-    // Create tensor to store the reshaped weights
-    const unsigned int mat_weights_cols = weights->info()->dimension(3);
-    const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
-    TensorShape        shape_wr(mat_weights_cols, mat_weights_rows);
-    TensorInfo         info_wr(shape_wr, 1, weights->info()->data_type());
-    _weights_reshaped.allocator()->init(info_wr);
+    // Check if it's a "fully connected" convolution
+    _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
 
-    // Create tensor to store transposed weights
-    TensorShape shape_wt(mat_weights_rows * 4, static_cast<unsigned int>(std::ceil(mat_weights_cols / 4.f)));
-    TensorInfo  info_wt(shape_wt, 1, weights->info()->data_type());
-    _weights_transposed.allocator()->init(info_wt);
+    unsigned int mat_weights_cols = weights->info()->dimension(3);
+    unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+
+    // Reshape weights if needed
+    if(_are_weights_reshaped)
+    {
+        mat_weights_cols                         = output->info()->dimension(2);
+        const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
+        mat_weights_rows                         = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
+    }
+    else
+    {
+        if(_is_fully_connected_convolution)
+        {
+            // Create tensor to store the reshaped weights
+            TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+            TensorInfo  info_wr(shape_wr, 1, dt, fixed_point_position);
+            _weights_reshaped.allocator()->init(info_wr);
+            _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+        }
+        else
+        {
+            // Create tensor to store transposed weights
+            const float transpose_width = 16.0f / input->info()->element_size();
+            TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
+            TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
+            _weights_reshaped.allocator()->init(info_wt);
+            _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
+        }
+        weights = &_weights_reshaped;
+    }
 
     // Create tensor to store im2col reshaped inputs
     const unsigned int mat_input_cols = mat_weights_rows;
@@ -93,58 +178,69 @@
     shape_im2col.set(0, mat_input_cols);
     shape_im2col.set(1, mat_input_rows);
     shape_im2col.set(2, 1);
-    TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type());
-    _input_im2col_reshaped.allocator()->init(info_im2col);
+    _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
 
-    // Create tensor to prepare input tensor for GEMM
-    TensorShape shape_interleaved = shape_im2col;
-    shape_interleaved.set(0, shape_interleaved.x() * 4);
-    shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
-    TensorInfo info_interleaved(shape_interleaved, 1, input->info()->data_type());
-    _input_interleaved_reshaped.allocator()->init(info_interleaved);
+    // Create tensor (interleave) to prepare input tensor for GEMM
+    if(!_is_fully_connected_convolution)
+    {
+        TensorShape shape_interleaved = shape_im2col;
+        shape_interleaved.set(0, shape_interleaved.x() * 4);
+        shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+        _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+    }
 
     // Create GEMM output tensor
     TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
     shape_gemm.set(0, mat_weights_cols);
     shape_gemm.set(1, mat_input_rows);
-    TensorInfo info_gemm(shape_gemm, 1, input->info()->data_type());
-    _gemm_output.allocator()->init(info_gemm);
+    _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
 
     // Configure kernels
     _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
-    _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
-    _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
-    _weights_transposed_kernel.configure(&_weights_reshaped, &_weights_transposed);
-    _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, &_gemm_output, 1.0f);
+    if(_is_fully_connected_convolution)
+    {
+        _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+    }
+    else
+    {
+        _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+        _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+    }
     _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
 
-    // Allocate the tensors once the all configure methods have been called
-    _weights_reshaped.allocator()->allocate();
-    _weights_transposed.allocator()->allocate();
+    // Allocate intermediate tensors
+    if(!_are_weights_reshaped)
+    {
+        _weights_reshaped.allocator()->allocate();
+    }
     _input_im2col_reshaped.allocator()->allocate();
-    _input_interleaved_reshaped.allocator()->allocate();
+    if(!_is_fully_connected_convolution)
+    {
+        _input_interleaved_reshaped.allocator()->allocate();
+    }
     _gemm_output.allocator()->allocate();
 }
 
 void NEConvolutionLayer::run()
 {
     // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
+    if(!_are_weights_reshaped)
     {
-        _is_first_run = false;
-        NEScheduler::get().multithread(&_weights_reshape_kernel, 3);
-        NEScheduler::get().multithread(&_weights_transposed_kernel);
+        _are_weights_reshaped = true;
+        _reshape_weights.run();
     }
 
     // Run input reshaping
-    NEScheduler::get().multithread(&_input_im2col_kernel);
+    NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+    if(!_is_fully_connected_convolution)
+    {
+        // Run interleave
+        NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
+    }
 
-    // Run interleave
-    NEScheduler::get().multithread(&_input_interleave_kernel);
-
-    // Runs GEMM on reshaped matrices
-    NEScheduler::get().multithread(&_mm_kernel);
+    // Runs matrix multiply on reshaped matrices
+    NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
 
     // Reshape output matrix
-    NEScheduler::get().multithread(&_output_col2im_kernel);
+    NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
new file mode 100644
index 0000000..7d2c549
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEDepthConcatenate::NEDepthConcatenate()
+    : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+{
+}
+
+void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+    _num_inputs             = inputs_vector.size();
+    _concat_kernels_vector  = arm_compute::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
+    _border_handlers_vector = arm_compute::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
+
+    unsigned int depth_offset = 0;
+    for(unsigned int i = 0; i < _num_inputs; ++i)
+    {
+        _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
+        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+        depth_offset += inputs_vector.at(i)->info()->dimension(2);
+    }
+}
+
+void NEDepthConcatenate::run()
+{
+    for(unsigned int i = 0; i < _num_inputs; ++i)
+    {
+        NEScheduler::get().schedule(&_border_handlers_vector[i], Window::DimX);
+        NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimX);
+    }
+}
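
NEDepthConcatenate stacks its inputs along the third (depth) dimension, advancing depth_offset by each input's depth, so the output tensor needs at least the summed depth of the inputs. A minimal usage sketch, assuming `a`, `b` and `dst` are already initialised and allocated NEON Tensors where `dst` matches the inputs' width and height and has depth of at least depth(a) + depth(b); the names are hypothetical, not from the diff.

    std::vector<ITensor *> inputs = { &a, &b };

    NEDepthConcatenate concat;
    concat.configure(inputs, &dst);

    // Runs one border-fill kernel and one concatenation kernel per input (see run() above)
    concat.run();
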
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvert.cpp
index 5f3594a..a339cae 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvert.cpp
@@ -33,8 +33,8 @@
 
 void NEDepthConvert::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::U32, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::U16, DataType::S16, DataType::U32, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
     ARM_COMPUTE_ERROR_ON(input == output);
     ARM_COMPUTE_ERROR_ON(input->info()->data_type() == output->info()->data_type());
 
diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp
index 340e1ce..2887c13 100644
--- a/src/runtime/NEON/functions/NEDerivative.cpp
+++ b/src/runtime/NEON/functions/NEDerivative.cpp
@@ -48,5 +48,5 @@
 void NEDerivative::run()
 {
     _border_handler.run(_border_handler.window());
-    NEScheduler::get().multithread(&_kernel);
+    NEScheduler::get().schedule(&_kernel, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
new file mode 100644
index 0000000..3f3e771
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+NEDirectConvolutionLayer::NEDirectConvolutionLayer()
+    : _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
+{
+}
+
+void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+
+    // Free accumulator
+    if(_accumulator.buffer() != nullptr)
+    {
+        _accumulator.allocator()->free();
+    }
+
+    // Allocate the intermediate accumulator tensor in case of fixed point input
+    if(output->info()->data_type() == DataType::QS8)
+    {
+        _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
+        _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+        _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+        _accumulator.allocator()->allocate();
+    }
+    else
+    {
+        _conv_kernel.configure(input, weights, output, conv_info);
+        _accumulate_bias_kernel.configure(output, bias);
+    }
+
+    // Add zero padding XY
+    _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+void NEDirectConvolutionLayer::run()
+{
+    _input_border_handler.run(_input_border_handler.window());
+
+    NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
+    NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+}
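
For QS8 inputs the function accumulates into an intermediate QS16 tensor and adds the bias while converting back to QS8, whereas for F32 the bias is accumulated in place on the output. A minimal F32 usage sketch, assuming `src`, `weights`, `bias` and `dst` are already initialised and allocated NEON Tensors with matching dimensions; the names and the stride/padding values are hypothetical, and which convolution configurations the kernel accepts in this release is not covered by the diff.

    NEDirectConvolutionLayer conv;

    // Stride 1x1, no padding (illustrative values only)
    conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0));

    conv.run();
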
diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
index a8b132d..f6ec677 100644
--- a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
+++ b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
@@ -52,11 +52,11 @@
 void NEEqualizeHistogram::run()
 {
     // Calculate histogram of input.
-    NEScheduler::get().multithread(&_histogram_kernel);
+    NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
 
     // Calculate cumulative distribution of histogram and create LUT.
     _cd_histogram_kernel.run(_cd_histogram_kernel.window());
 
     // Map input to output using created LUT.
-    NEScheduler::get().multithread(&_map_histogram_kernel);
+    NEScheduler::get().schedule(&_map_histogram_kernel, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index 670b4d4..33a58f1 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -90,12 +90,12 @@
 {
     _border_handler.run(_border_handler.window());
 
-    NEScheduler::get().multithread(&_fast_corners_kernel);
+    NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
 
     if(_non_max)
     {
-        NEScheduler::get().multithread(&_nonmax_kernel);
+        NEScheduler::get().schedule(&_nonmax_kernel, Window::DimY);
     }
 
-    NEScheduler::get().multithread(&_fill_kernel);
+    NEScheduler::get().schedule(&_fill_kernel, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
index 7ff8f2f..e884f4a 100644
--- a/src/runtime/NEON/functions/NEFillBorder.cpp
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -35,5 +35,5 @@
 
 void NEFillBorder::run()
 {
-    NEScheduler::get().multithread(&_border_handler, Window::DimZ);
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
 }
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index e6785b3..abb41e9 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -31,35 +31,99 @@
 
 using namespace arm_compute;
 
+NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights()
+    : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+{
+}
+
+void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
+    ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
+
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
+
+    _transpose_weights   = transpose_weights;
+    _is_batched_fc_layer = is_batched_fc_layer;
+
+    // Check if we need to transpose the weights
+    if(_transpose_weights)
+    {
+        if(_is_batched_fc_layer)
+        {
+            // Initialize the output tensor for transpose
+            TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
+            _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
+            _transpose_kernel.configure(input, &_transpose_output);
+
+            // Configure transpose 1xW kernel
+            _transpose1xW_kernel.configure(&_transpose_output, output);
+
+            // Allocate temporary tensor used for transposing the weights
+            _transpose_output.allocator()->allocate();
+        }
+        else
+        {
+            _transpose_kernel.configure(input, output);
+        }
+    }
+    else
+    {
+        if(_is_batched_fc_layer)
+        {
+            // Configure transpose 1xW kernel
+            _transpose1xW_kernel.configure(input, output);
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
+        }
+    }
+}
+
+void NEFullyConnectedLayerReshapeWeights::run()
+{
+    if(_transpose_weights)
+    {
+        NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+    }
+    if(_is_batched_fc_layer)
+    {
+        NEScheduler::get().schedule(&_transpose1xW_kernel, Window::DimY);
+    }
+}
+
 NEFullyConnectedLayer::NEFullyConnectedLayer()
-    : _im2col_kernel(), _transpose_kernel(), _transpose1xW_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _transpose_output(),
-      _transpose1xW_output(), _is_first_run(true), _transpose_weights(true), _fc_after_conv(false), _batched_fc_layer(false), _accumulate_biases(false)
+    : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
+      _are_weights_reshaped(false), _is_fc_after_conv(false), _is_batched_fc_layer(false), _accumulate_biases(false)
 {
 }
 
 void NEFullyConnectedLayer::configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
+
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
 
     // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
 
     // Initialize output tensor for im2col
     TensorShape shape_im2col;
-    shape_im2col.set(0, weights->info()->dimension(1));
+    shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
     shape_im2col.set(1, input->info()->dimension(3));
     shape_im2col.set(2, input->info()->dimension(4));
     shape_im2col.set(3, input->info()->dimension(5));
-    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
 
     // Initialize output tensor for interleave 4x4
     TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
     shape_interleaved.set(0, shape_interleaved.x() * 4);
     shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
-    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
-
-    // Initialize output tensor for transpose 1xW
-    TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
-    _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
 
     // Configure im2col kernel
     _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
@@ -67,55 +131,49 @@
     // Configure interleave4x4 kernel
     _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
 
-    // Configure transpose 1xW kernel
-    _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
-
     // Configure matrix multiply kernel
-    _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+    _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
 
     // Allocate the tensors once all the configure methods have been called
     _im2col_output.allocator()->allocate();
     _interleave4x4_output.allocator()->allocate();
-    _transpose1xW_output.allocator()->allocate();
 }
 
 void NEFullyConnectedLayer::configure_fc_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
 {
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
+
     // Initialize output tensor for interleave 4x4
     TensorShape shape_interleaved = input->info()->tensor_shape();
     shape_interleaved.set(0, shape_interleaved.x() * 4);
     shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
-    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
-
-    // Initialize output tensor for transpose 1xW
-    TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
-    _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
 
     // Configure interleave4x4 kernel
     _interleave4x4_kernel.configure(input, &_interleave4x4_output);
 
-    // Configure transpose 1xW kernel
-    _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
-
     // Configure matrix multiply kernel
-    _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+    _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
 
     // Allocate the tensors once all the configure methods have been called
     _interleave4x4_output.allocator()->allocate();
-    _transpose1xW_output.allocator()->allocate();
 }
 
 void NEFullyConnectedLayer::configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
 {
     ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
 
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
+
     // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
 
     // Initialize output tensor for im2col
     TensorShape shape_im2col;
-    shape_im2col.set(0, weights->info()->dimension(1));
+    shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
     shape_im2col.set(1, 1);
-    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
 
     // Configure im2col kernel
     _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
@@ -135,20 +193,21 @@
     _mm_kernel.configure(input, weights, output, 1.0f);
 }
 
-void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights)
+void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights, bool are_weights_reshaped)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
 
-    _is_first_run      = true;
-    _transpose_weights = transpose_weights;
-    _fc_after_conv     = true;
-    _batched_fc_layer  = false;
-    _accumulate_biases = false;
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
 
-    const ITensor *weights_to_use = weights;
+    _are_weights_reshaped = are_weights_reshaped;
+    _is_fc_after_conv     = true;
+    _is_batched_fc_layer  = false;
+    _accumulate_biases    = false;
 
     if(biases != nullptr)
     {
@@ -160,17 +219,6 @@
         _accumulate_biases_kernel.configure(output, biases);
     }
 
-    // Check if we need to transpose the weights
-    if(_transpose_weights)
-    {
-        // Initialize the output tensor for transpose
-        TensorShape shape_transposed(weights->info()->dimension(1), weights->info()->dimension(0));
-        _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, weights->info()->data_type()));
-        _transpose_kernel.configure(weights, &_transpose_output);
-
-        weights_to_use = &_transpose_output;
-    }
-
     // With the Fully Connected layer we can have 4 different cases:
     //  1) Convolution layer -> Fully Connected layer without batches
     //  2) Fully Connected layer -> Fully Connected layer without batches
@@ -178,15 +226,54 @@
     //  4) Fully Connected layer -> Fully Connected layer with batches
 
     // Check if we have a fully connected layer with batches
-    _batched_fc_layer = (output->info()->dimension(1) > 1);
+    _is_batched_fc_layer = (output->info()->dimension(1) > 1);
 
-    if(_batched_fc_layer)
+    const ITensor *weights_to_use = weights;
+
+    if(!are_weights_reshaped)
     {
-        _fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
-                                                                               input->info()->tensor_shape().cend(),
-                                                                               output->info()->tensor_shape().cbegin() + 1));
+        if((transpose_weights || _is_batched_fc_layer))
+        {
+            weights_to_use = &_reshape_weights_output;
 
-        if(_fc_after_conv)
+            if(transpose_weights)
+            {
+                if(_is_batched_fc_layer)
+                {
+                    const float transpose_width = 16.0f / input->info()->element_size();
+                    TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
+                    TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
+                    _reshape_weights_output.allocator()->init(info_wt);
+                }
+                else
+                {
+                    TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
+                    TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
+                    _reshape_weights_output.allocator()->init(info_wt);
+                }
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
+
+                const float transpose_width = 16.0f / input->info()->element_size();
+                TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
+                TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
+                _reshape_weights_output.allocator()->init(info_wt);
+            }
+
+            // Reshape the weights
+            _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+        }
+    }
+
+    if(_is_batched_fc_layer)
+    {
+        _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+                                                                                  input->info()->tensor_shape().cend(),
+                                                                                  output->info()->tensor_shape().cbegin() + 1));
+
+        if(_is_fc_after_conv)
         {
             // Fully Connected layer after a Convolution Layer with batches
             configure_conv_fc_wb(input, weights_to_use, output);
@@ -199,9 +286,10 @@
     }
     else
     {
-        _fc_after_conv = (weights_to_use->info()->dimension(1) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+        // In the non-batched case the weights are not reshaped with transpose1xW, so dimension(1) can be compared directly against the linearized input size
+        _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
 
-        if(_fc_after_conv)
+        if(_is_fc_after_conv)
         {
             // Fully Connected layer after a Convolution Layer without batches
             configure_conv_fc_nb(input, weights_to_use, output);
@@ -213,47 +301,44 @@
         }
     }
 
-    // Allocate the transpose tensor if the transpose_weights flag is true and once all the configure methods have been called
-    if(_transpose_weights)
+    // Allocate the reshaped weights tensor only if the weights have not been reshaped yet, once all the configure methods have been called
+    if(!are_weights_reshaped)
     {
-        _transpose_output.allocator()->allocate();
+        if(transpose_weights || _is_batched_fc_layer)
+        {
+            // Allocate the tensor for the weights reshaped
+            _reshape_weights_output.allocator()->allocate();
+        }
     }
 }
 
 void NEFullyConnectedLayer::run()
 {
     // Reshape the weights (happens only once)
-    if(_is_first_run)
+    if(!_are_weights_reshaped)
     {
-        _is_first_run = false;
-        if(_transpose_weights)
-        {
-            NEScheduler::get().multithread(&_transpose_kernel);
-        }
-        if(_batched_fc_layer)
-        {
-            NEScheduler::get().multithread(&_transpose1xW_kernel);
-        }
+        _are_weights_reshaped = true;
+        _reshape_weights_kernel.run();
     }
 
     // Linearize input if comes from a convolutional layer
-    if(_fc_after_conv)
+    if(_is_fc_after_conv)
     {
-        NEScheduler::get().multithread(&_im2col_kernel);
+        NEScheduler::get().schedule(&_im2col_kernel, Window::DimY);
     }
 
     // Interleave input
-    if(_batched_fc_layer)
+    if(_is_batched_fc_layer)
     {
-        NEScheduler::get().multithread(&_interleave4x4_kernel);
+        NEScheduler::get().schedule(&_interleave4x4_kernel, Window::DimY);
     }
 
     // Run matrix multiply
-    NEScheduler::get().multithread(&_mm_kernel);
+    NEScheduler::get().schedule(&_mm_kernel, _is_batched_fc_layer ? Window::DimY : Window::DimX);
 
     // Accumulate biases if provided
     if(_accumulate_biases)
     {
-        NEScheduler::get().multithread(&_accumulate_biases_kernel);
+        NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
     }
 }
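
The reshaped-weights shapes computed in configure() above all follow the transpose1xW layout, where one row packs 16 bytes worth of elements. A small sketch of that arithmetic; the helper name is hypothetical and the formula is taken from the transpose_weights && _is_batched_fc_layer branch above.

#include <cmath>

#include "arm_compute/core/TensorShape.h"

// Hypothetical helper: shape of the transpose1xW-reshaped weights.
// Each row packs 16 / element_size elements (4 for F32, 16 for QS8).
arm_compute::TensorShape transpose1xw_shape(const arm_compute::TensorShape &weights_shape, size_t element_size)
{
    const float        transpose_width = 16.0f / element_size;
    const unsigned int w               = weights_shape[0] * static_cast<unsigned int>(transpose_width);
    const unsigned int h               = static_cast<unsigned int>(std::ceil(weights_shape[1] / transpose_width));
    return arm_compute::TensorShape(w, h);
}

For F32 weights transpose_width is 4, which matches the transpose1xW layout already used elsewhere in the GEMM path.
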
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index f155dd5..15d5f4e 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -43,16 +43,16 @@
 
 void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(d, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(d, 1, DataType::F32, DataType::F16, DataType::QS8);
 
     if(c != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
         ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
-        ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
+        ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
         ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != d->info()->dimension(0), "The C matrix must have the same number of rows as the output matrix");
         ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
     }
@@ -60,8 +60,8 @@
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
     ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
 
-    // Check if the first input tensor is a vector and the data type is F32. If so, all the kernels for reshaping the tensors can be skipped
-    if((a->info()->dimension(1) == 1) && (a->info()->data_type() == DataType::F32))
+    // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
+    if((a->info()->dimension(1) == 1))
     {
         _run_vector_matrix_multiplication = true;
 
@@ -94,14 +94,20 @@
                     break;
                 }
 #endif
+            case DataType::QS8:
+            {
+                shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+                shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.0f));
+                break;
+            }
             default:
             {
                 ARM_COMPUTE_ERROR_ON("Data type not supported");
             }
         }
 
-        TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
-        TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+        TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
+        TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
 
         _tmp_a.allocator()->init(info_a);
         _tmp_b.allocator()->init(info_b);
@@ -133,18 +139,18 @@
     if(!_run_vector_matrix_multiplication)
     {
         // Run interleave kernel
-        NEScheduler::get().multithread(&_interleave_kernel);
+        NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
 
         // Run transpose kernel
-        NEScheduler::get().multithread(&_transpose_kernel);
+        NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
     }
 
     // Run matrix multiply kernel
-    NEScheduler::get().multithread(&_mm_kernel, _run_vector_matrix_multiplication ? 0 : 1);
+    NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
 
     // Run matrix addition kernel
     if(_run_addition)
     {
-        NEScheduler::get().multithread(&_ma_kernel);
+        NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
     }
 }
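
A hedged usage sketch of the extended NEGEMM interface (QS8 is now accepted alongside F16/F32); the alpha/beta values and the assumption that all four tensors are pre-allocated are illustrative only.

#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Sketch: d = alpha * a * b + beta * c (passing nullptr for c skips the matrix addition kernel).
void run_gemm(Tensor &a, Tensor &b, Tensor &c, Tensor &d)
{
    NEGEMM gemm;
    gemm.configure(&a, &b, &c, &d, 1.0f, 0.5f); // illustrative alpha/beta
    gemm.run();
}
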
diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp
index 3866f28..b64f769 100644
--- a/src/runtime/NEON/functions/NEGEMMLowp.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowp.cpp
@@ -49,14 +49,14 @@
     ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
     ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
 
-    /* The interleaved output matrix will have the following shape: [ a_height * 4, a_width / 4 ] */
+    /* The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] */
     TensorShape shape_tmp_a = a->info()->tensor_shape();
     shape_tmp_a.set(0, a->info()->dimension(0) * 4);
     shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
 
     TensorShape shape_tmp_b = b->info()->tensor_shape();
-    shape_tmp_b.set(0, b->info()->dimension(1) * 4);
-    shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.f));
+    shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+    shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
 
     TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
     TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
@@ -74,11 +74,11 @@
 void NEGEMMLowp::run()
 {
     /* Run interleave kernel */
-    NEScheduler::get().multithread(&_interleave_kernel);
+    NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
 
     /* Run transpose kernel */
-    NEScheduler::get().multithread(&_transpose_kernel);
+    NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
 
     /* Run matrix multiply kernel */
-    NEScheduler::get().multithread(&_mm_kernel);
+    NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
index 8cba30d..dc40ece 100644
--- a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
+++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
@@ -34,11 +34,6 @@
 
 void NEGEMMTranspose1xW::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(1) * 4);
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(input->info()->dimension(0) / 4.0f));
     auto k = arm_compute::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index 34447b1..5ccc765 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -55,6 +55,6 @@
 void NEGaussian5x5::run()
 {
     _border_handler.run(_border_handler.window());
-    NEScheduler::get().multithread(&_kernel_hor);
-    NEScheduler::get().multithread(&_kernel_vert);
+    NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
+    NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index cb8296b..e1d64f1 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -108,8 +108,8 @@
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
         _border_handler[i].run(_border_handler[i].window());
-        NEScheduler::get().multithread(_horizontal_reduction.get() + i);
-        NEScheduler::get().multithread(_vertical_reduction.get() + i);
+        NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
+        NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
     }
 }
 
@@ -178,6 +178,6 @@
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
         _gaus5x5[i].run();
-        NEScheduler::get().multithread(_scale_nearest.get() + i);
+        NEScheduler::get().schedule(_scale_nearest.get() + i, Window::DimY);
     }
 }
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index a5073b9..a592f53 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -92,8 +92,8 @@
     _gradient.run();
 
     // Run orientation binning kernel
-    NEScheduler::get().multithread(&_orient_bin);
+    NEScheduler::get().schedule(&_orient_bin, Window::DimY);
 
     // Run block normalization kernel
-    NEScheduler::get().multithread(&_block_norm);
+    NEScheduler::get().schedule(&_block_norm, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp
index f0d6121..e8ed29d 100644
--- a/src/runtime/NEON/functions/NEHOGDetector.cpp
+++ b/src/runtime/NEON/functions/NEHOGDetector.cpp
@@ -31,8 +31,6 @@
 void NEHOGDetector::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
 {
     auto k = arm_compute::cpp14::make_unique<NEHOGDetectorKernel>();
-
     k->configure(input, hog, detection_windows, detection_window_stride, threshold, idx_class);
-
     _kernel = std::move(k);
-}
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index c5b37f4..2f4b880 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -76,5 +76,5 @@
     _derivative.run();
 
     // Run magnitude/phase kernel
-    NEScheduler::get().multithread(_mag_phase.get());
+    NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index effa64f..173b8f4 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -115,7 +115,7 @@
     _orient_bin_kernel = arm_compute::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
     _block_norm_kernel = arm_compute::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
     _hog_detect_kernel = arm_compute::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
-    _non_maxima_kernel = arm_compute::cpp14::make_unique<NEHOGNonMaximaSuppressionKernel>();
+    _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
     _hog_space         = arm_compute::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
     _hog_norm_space    = arm_compute::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
 
@@ -208,13 +208,13 @@
     // Run orientation binning kernel
     for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
     {
-        NEScheduler::get().multithread(_orient_bin_kernel.get() + i);
+        NEScheduler::get().schedule(_orient_bin_kernel.get() + i, Window::DimY);
     }
 
     // Run block normalization kernel
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
-        NEScheduler::get().multithread(_block_norm_kernel.get() + i);
+        NEScheduler::get().schedule(_block_norm_kernel.get() + i, Window::DimY);
     }
 
     // Run HOG detector kernel
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index adefd47..b54fb67 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -199,13 +199,13 @@
     _border_gy.run(_border_gy.window());
 
     // Run harris score kernel
-    NEScheduler::get().multithread(_harris_score.get());
+    NEScheduler::get().schedule(_harris_score.get(), Window::DimY);
 
     // Run non-maxima suppression
     _non_max_suppr.run();
 
     // Run corner candidate kernel
-    NEScheduler::get().multithread(&_candidates);
+    NEScheduler::get().schedule(&_candidates, Window::DimY);
 
     // Run sort & euclidean distance
     _sort_euclidean.run(_sort_euclidean.window());
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
index 6747f2e..c42b2a5 100644
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -54,5 +54,5 @@
 void NEHistogram::run()
 {
     // Calculate histogram of input.
-    NEScheduler::get().multithread(&_histogram_kernel);
+    NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
new file mode 100644
index 0000000..85d7ba3
--- /dev/null
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+NELocallyConnectedLayer::NELocallyConnectedLayer()
+    : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+{
+}
+
+void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
+    }
+
+    bool _has_bias = (biases != nullptr);
+    _is_first_run  = true;
+
+    // Get parameters for conv_info
+    unsigned int stride_x = 0;
+    unsigned int stride_y = 0;
+    unsigned int pad_x    = 0;
+    unsigned int pad_y    = 0;
+    std::tie(stride_x, stride_y) = conv_info.stride();
+    std::tie(pad_x, pad_y)       = conv_info.pad();
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+                                                 stride_x, stride_y, pad_x, pad_y, conv_info.round());
+
+    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+    // Create tensor to store the reshaped weights
+    const size_t mat_weights_cols = weights->info()->dimension(3);
+    const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+    const size_t mat_weights_num  = weights->info()->dimension(4);
+
+    const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
+
+    _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
+
+    // Create tensor to store im2col reshaped inputs
+    const size_t mat_input_cols = mat_weights_rows;
+    const size_t mat_input_rows = conv_w * conv_h;
+    TensorShape  shape_im2col   = input->info()->tensor_shape();
+    shape_im2col.set(0, mat_input_cols);
+    shape_im2col.set(1, mat_input_rows);
+    shape_im2col.set(2, 1);
+
+    _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+
+    // Create locally connected layer output tensor
+    TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+    shape_gemm.set(0, mat_weights_cols);
+    shape_gemm.set(1, mat_input_rows);
+    _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+
+    // Configure kernels
+    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+    _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+    _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
+    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+    // Allocate intermediate tensors
+    _weights_reshaped.allocator()->allocate();
+    _input_im2col_reshaped.allocator()->allocate();
+    _gemm_output.allocator()->allocate();
+}
+
+void NELocallyConnectedLayer::run()
+{
+    // Reshape the weights (runs only once after each configure)
+    if(_is_first_run)
+    {
+        _is_first_run = false;
+        NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+    }
+
+    // Run input reshaping
+    NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+
+    // Runs GEMM on reshaped matrices
+    NEScheduler::get().schedule(&_mm_kernel, Window::DimX);
+
+    // Reshape output matrix
+    NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+}
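
For reference, a hedged usage sketch of the new NELocallyConnectedLayer. The 5D weights layout [kernel_x, kernel_y, ifm, num_kernels, conv_w * conv_h] is implied by the dimension checks in configure() above; the stride/pad values below are illustrative.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_locally_connected(Tensor &src, Tensor &weights, Tensor &bias, Tensor &dst)
{
    NELocallyConnectedLayer lc;
    // Unlike a convolution, every output position has its own weight set
    // (hence the extra conv_w * conv_h dimension checked above).
    lc.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0));
    lc.run(); // the first run() also reshapes the weights
}
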
diff --git a/src/runtime/NEON/functions/NEMeanStdDev.cpp b/src/runtime/NEON/functions/NEMeanStdDev.cpp
index 3fb5769..47143f5 100644
--- a/src/runtime/NEON/functions/NEMeanStdDev.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDev.cpp
@@ -43,5 +43,5 @@
     _global_sum         = 0;
     _global_sum_squared = 0;
 
-    NEScheduler::get().multithread(&_mean_stddev_kernel);
+    NEScheduler::get().schedule(&_mean_stddev_kernel, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEMinMaxLocation.cpp b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
index ba73ef9..cab9200 100644
--- a/src/runtime/NEON/functions/NEMinMaxLocation.cpp
+++ b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
@@ -43,8 +43,8 @@
     _min_max.reset();
 
     /* Run min max kernel */
-    NEScheduler::get().multithread(&_min_max);
+    NEScheduler::get().schedule(&_min_max, Window::DimY);
 
     /* Run min max location */
-    NEScheduler::get().multithread(&_min_max_loc);
+    NEScheduler::get().schedule(&_min_max_loc, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index ff38e61..69ff325 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -41,7 +41,7 @@
 {
     ARM_COMPUTE_ERROR_ON(input == nullptr);
 
-    TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type());
+    TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
     _input_squared.allocator()->init(tensor_info);
 
     // Configure kernels
@@ -55,7 +55,7 @@
 
 void NENormalizationLayer::run()
 {
-    NEScheduler::get().multithread(&_multiply_kernel);
-    NEScheduler::get().multithread(&_border_handler);
-    NEScheduler::get().multithread(&_norm_kernel);
+    NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
+    NEScheduler::get().schedule(&_border_handler, Window::DimY);
+    NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index 993153b..49135e4 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -113,7 +113,7 @@
         // Run Scharr kernel
         _func_scharr[level - 1].run();
 
-        /* Run Lucas-Kanade kernel */
-        NEScheduler::get().multithread(_kernel_tracker.get() + level - 1, Window::DimX);
+        // Run Lucas-Kanade kernel
+        NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX);
     }
 }
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
index 1859b30..8967a22 100644
--- a/src/runtime/NEON/functions/NESobel5x5.cpp
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -76,6 +76,6 @@
 void NESobel5x5::run()
 {
     _border_handler.run(_border_handler.window());
-    NEScheduler::get().multithread(&_sobel_hor);
-    NEScheduler::get().multithread(&_sobel_vert);
+    NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
+    NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
index 8af5e8d..f628da9 100644
--- a/src/runtime/NEON/functions/NESobel7x7.cpp
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -76,6 +76,6 @@
 void NESobel7x7::run()
 {
     _border_handler.run(_border_handler.window());
-    NEScheduler::get().multithread(&_sobel_hor);
-    NEScheduler::get().multithread(&_sobel_vert);
+    NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
+    NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 55d4d3a..0651eab 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -32,21 +32,22 @@
 using namespace arm_compute;
 
 NESoftmaxLayer::NESoftmaxLayer()
-    : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _fill_border_kernel_sum(), _max(), _sum(), _tmp()
+    : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp()
 {
 }
 
 void NESoftmaxLayer::configure(ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
 
     // Create intermediate tensors shapes
-    _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
+    TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
+    _tmp.allocator()->init(tensor_info_tmp);
 
     TensorShape shape = input->info()->tensor_shape();
     shape.set(0, 1);
-    TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
+    TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
     _max.allocator()->init(tensor_info_max_sum);
     _sum.allocator()->init(tensor_info_max_sum);
 
@@ -55,9 +56,6 @@
     _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
     _norm_kernel.configure(&_tmp, &_sum, output);
     _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-FLT_MAX));
-    // Fill the border around tmp buffer with sensible negative value.
-    // This avoids exp(-FLT_MAX) which will lead to -inf and destroy the calculation of sum when input is not a multiple of processed elements
-    _fill_border_kernel_sum.configure(input, _shift_exp_sum_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-50.f));
 
     // Allocate intermediate tensors
     _tmp.allocator()->allocate();
@@ -67,9 +65,8 @@
 
 void NESoftmaxLayer::run()
 {
-    NEScheduler::get().multithread(&_fill_border_kernel);
-    NEScheduler::get().multithread(&_max_kernel);
-    NEScheduler::get().multithread(&_fill_border_kernel_sum);
-    NEScheduler::get().multithread(&_shift_exp_sum_kernel);
-    NEScheduler::get().multithread(&_norm_kernel);
+    NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+    NEScheduler::get().schedule(&_max_kernel, Window::DimY);
+    NEScheduler::get().schedule(&_shift_exp_sum_kernel, Window::DimY);
+    NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
 }
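
A short, hedged usage sketch of the updated NESoftmaxLayer: per the diff, input and output may now be QS8 as well as F32, and the intermediate tensors inherit the input's fixed point position.

#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_softmax(Tensor &src, Tensor &dst)
{
    NESoftmaxLayer softmax;
    softmax.configure(&src, &dst); // src and dst: F32 or QS8, with matching data types
    softmax.run();
}
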
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
new file mode 100644
index 0000000..0cced73
--- /dev/null
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/OMP/OMPScheduler.h"
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+
+#include <omp.h>
+
+using namespace arm_compute;
+
+OMPScheduler &OMPScheduler::get()
+{
+    static OMPScheduler scheduler;
+    return scheduler;
+}
+
+OMPScheduler::OMPScheduler()
+    : _num_threads(omp_get_max_threads())
+{
+}
+
+unsigned int OMPScheduler::num_threads() const
+{
+    return _num_threads;
+}
+
+void OMPScheduler::set_num_threads(unsigned int num_threads)
+{
+    const unsigned int num_cores = omp_get_max_threads();
+    _num_threads                 = num_threads == 0 ? num_cores : num_threads;
+}
+
+void OMPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
+
+    const Window      &max_window     = kernel->window();
+    const unsigned int num_iterations = max_window.num_iterations(split_dimension);
+    const unsigned int num_threads    = std::min(num_iterations, _num_threads);
+
+    if(!kernel->is_parallelisable() || 1 == num_threads)
+    {
+        kernel->run(max_window);
+    }
+    else
+    {
+        #pragma omp parallel num_threads(num_threads)
+        {
+            #pragma omp for
+            for(unsigned int t = 0; t < num_threads; ++t)
+            {
+                Window win = max_window.split_window(split_dimension, t, num_threads);
+                win.set_thread_id(t);
+                win.set_num_threads(num_threads);
+                kernel->run(win);
+            }
+        }
+    }
+}
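
The parallel branch above relies on Window::split_window() to hand each OpenMP thread a disjoint slice of the chosen dimension. A minimal sketch of that partitioning; the helper name is hypothetical, split_window() itself is part of the core Window API.

#include "arm_compute/core/Window.h"

// Thread t of num_threads gets roughly num_iterations(split_dimension) / num_threads
// iterations of that dimension; the other dimensions are left untouched.
arm_compute::Window slice_for_thread(const arm_compute::Window &max_window, size_t split_dimension, unsigned int t, unsigned int num_threads)
{
    return max_window.split_window(split_dimension, t, num_threads);
}
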
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
new file mode 100644
index 0000000..a131928
--- /dev/null
+++ b/src/runtime/Scheduler.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Scheduler.h"
+
+#include "arm_compute/core/Error.h"
+#if ARM_COMPUTE_CPP_SCHEDULER
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+#endif
+
+#include "arm_compute/runtime/SingleThreadScheduler.h"
+
+#if ARM_COMPUTE_OPENMP_SCHEDULER
+#include "arm_compute/runtime/OMP/OMPScheduler.h"
+#endif
+
+using namespace arm_compute;
+
+#if !ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::OMP;
+#elif ARM_COMPUTE_CPP_SCHEDULER && !ARM_COMPUTE_OPENMP_SCHEDULER
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
+#elif ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
+#else
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST;
+#endif
+
+void Scheduler::set(Type t)
+{
+    ARM_COMPUTE_ERROR_ON(!Scheduler::is_available(t));
+    _scheduler_type = t;
+}
+
+bool Scheduler::is_available(Type t)
+{
+    switch(t)
+    {
+        case Type::ST:
+        {
+            return true;
+        }
+        case Type::CPP:
+        {
+#if ARM_COMPUTE_CPP_SCHEDULER
+            return true;
+#else
+            return false;
+#endif
+        }
+        case Type::OMP:
+        {
+#if ARM_COMPUTE_OPENMP_SCHEDULER
+            return true;
+#else
+            return false;
+#endif
+        }
+        case Type::CUSTOM:
+        {
+            return _custom_scheduler != nullptr;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Invalid Scheduler type");
+            return false;
+        }
+    }
+}
+
+Scheduler::Type Scheduler::get_type()
+{
+    return _scheduler_type;
+}
+
+IScheduler &Scheduler::get()
+{
+    switch(_scheduler_type)
+    {
+        case Type::ST:
+        {
+            return SingleThreadScheduler::get();
+        }
+        case Type::CPP:
+        {
+#if ARM_COMPUTE_CPP_SCHEDULER
+            return CPPScheduler::get();
+#else
+            ARM_COMPUTE_ERROR("Recompile with cppthreads=1 to use C++11 scheduler.");
+#endif
+            break;
+        }
+        case Type::OMP:
+        {
+#if ARM_COMPUTE_OPENMP_SCHEDULER
+            return OMPScheduler::get();
+#else
+            ARM_COMPUTE_ERROR("Recompile with openmp=1 to use openmp scheduler.");
+#endif
+            break;
+        }
+        case Type::CUSTOM:
+        {
+            if(_custom_scheduler == nullptr)
+            {
+                ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr<IScheduler> &scheduler) before Scheduler::get()");
+            }
+            else
+            {
+                return *_custom_scheduler;
+            }
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Invalid Scheduler type");
+            break;
+        }
+    }
+    return SingleThreadScheduler::get();
+}
+
+std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
+
+void Scheduler::set(std::shared_ptr<IScheduler> &scheduler)
+{
+    _custom_scheduler = scheduler;
+    set(Type::CUSTOM);
+}
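
A hedged sketch of how a caller can pick a scheduler at runtime with the new Scheduler façade. Availability depends on the build flags referenced in the error messages above (cppthreads=1, openmp=1), and the sketch assumes IScheduler exposes the same set_num_threads() shown on OMPScheduler above.

#include "arm_compute/runtime/Scheduler.h"

using namespace arm_compute;

void pick_scheduler()
{
    // Prefer the OpenMP backend when the library was built with openmp=1,
    // otherwise keep the compile-time default selected above.
    if(Scheduler::is_available(Scheduler::Type::OMP))
    {
        Scheduler::set(Scheduler::Type::OMP);
        Scheduler::get().set_num_threads(0); // 0 maps to one thread per core in OMPScheduler
    }
}
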
diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp
new file mode 100644
index 0000000..32924be
--- /dev/null
+++ b/src/runtime/SubTensor.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/SubTensor.h"
+
+#include "arm_compute/core/Error.h"
+
+using namespace arm_compute;
+
+SubTensor::SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+    : _parent(nullptr), _info()
+{
+    ARM_COMPUTE_ERROR_ON(parent == nullptr);
+    _info   = SubTensorInfo(parent->info(), tensor_shape, coords);
+    _parent = parent;
+}
+
+ITensorInfo *SubTensor::info() const
+{
+    return &_info;
+}
+
+ITensorInfo *SubTensor::info()
+{
+    return &_info;
+}
+
+uint8_t *SubTensor::buffer() const
+{
+    ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+    return _parent->buffer();
+}
+
+ITensor *SubTensor::parent()
+{
+    return _parent;
+}
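
A brief, hedged sketch of the new SubTensor: it shares the parent's buffer and only carries its own SubTensorInfo (shape plus start coordinates), so no extra memory is allocated. The shape and coordinates below are illustrative.

#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/runtime/SubTensor.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void view_first_slice(Tensor &parent)
{
    // 16x16 view starting at the origin of the parent tensor (illustrative shape).
    SubTensor    view(&parent, TensorShape(16U, 16U), Coordinates(0, 0, 0));
    ITensorInfo *info = view.info(); // strides and offset are derived from the parent's info
    (void)info;
}
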
diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp
index 617e7a8..435068c 100644
--- a/src/runtime/Tensor.cpp
+++ b/src/runtime/Tensor.cpp
@@ -30,12 +30,12 @@
 {
 }
 
-TensorInfo *Tensor::info() const
+ITensorInfo *Tensor::info() const
 {
     return &_allocator.info();
 }
 
-TensorInfo *Tensor::info()
+ITensorInfo *Tensor::info()
 {
     return &_allocator.info();
 }
diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp
new file mode 100644
index 0000000..1b06117
--- /dev/null
+++ b/src/runtime/Utils.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Utils.h"
+
+#include <map>
+#include <string>
+
+using namespace arm_compute;
+
+const std::string &arm_compute::string_from_scheduler_type(Scheduler::Type t)
+{
+    static std::map<Scheduler::Type, const std::string> scheduler_type_map =
+    {
+        { Scheduler::Type::ST, "Single Thread" },
+        { Scheduler::Type::CPP, "C++11 Threads" },
+        { Scheduler::Type::OMP, "OpenMP Threads" },
+        { Scheduler::Type::CUSTOM, "Custom" }
+    };
+
+    return scheduler_type_map[t];
+}