arm_compute v17.09
Change-Id: I4bf8f4e6e5f84ce0d5b6f5ba570d276879f42a81
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
index 8b6419c..8a7f37a 100644
--- a/src/core/AccessWindowStatic.cpp
+++ b/src/core/AccessWindowStatic.cpp
@@ -194,8 +194,8 @@
PaddingSize padding;
padding.left = std::max(0, -_start_x);
padding.right = std::max<int>(0, _end_x - shape[0]);
- padding.top = shape.num_dimensions() == 1 ? 0 : std::max(0, -_start_y);
- padding.bottom = shape.num_dimensions() == 1 ? 0 : std::max<int>(0, _end_y - shape[1]);
+ padding.top = std::max(0, -_start_y);
+ padding.bottom = std::max<int>(0, _end_y - shape[1]);
// Update strides in tensor info
return _info->extend_padding(padding);
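
For reference, the padding rule in this hunk can be read as a small standalone computation: any part of the static access window [start, end) that falls outside the tensor shape becomes border padding on that side (after this change the top/bottom terms are computed even for 1D shapes). A minimal C++ sketch, with names local to this example:

#include <algorithm>
#include <cstdio>

// Illustrative only: mirrors the padding formula above for a 2D shape.
struct Padding
{
    int left, right, top, bottom;
};

Padding padding_for_window(int start_x, int end_x, int start_y, int end_y, int shape_x, int shape_y)
{
    Padding p{};
    p.left   = std::max(0, -start_x);        // window starts left of column 0
    p.right  = std::max(0, end_x - shape_x); // window ends past the last column
    p.top    = std::max(0, -start_y);        // now also computed for 1D shapes
    p.bottom = std::max(0, end_y - shape_y);
    return p;
}

int main()
{
    // A 3x3 neighbourhood on a 16x16 tensor accessed from (-1,-1) to (17,17)
    const Padding p = padding_for_window(-1, 17, -1, 17, 16, 16);
    std::printf("left=%d right=%d top=%d bottom=%d\n", p.left, p.right, p.top, p.bottom); // 1 1 1 1
    return 0;
}
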
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index b3605c4..b104330 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -66,8 +66,8 @@
// a size of the region.
// As the relation between input and output is transposed window.y() is
// used for x shape and window.x() for y shape.
- shape.set(0, std::min<int>(old_anchor[1] + old_shape[1] - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
- shape.set(1, std::min<int>(old_anchor[0] + old_shape[0] - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
+ shape.set(0, std::min<int>((old_anchor[1] + old_shape[1]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
+ shape.set(1, std::min<int>((old_anchor[0] + old_shape[0]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
// For higher dimensions use the intersection of the window size and the
// valid region of the input
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 21b72dd..821fb4c 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -27,40 +27,36 @@
#include "arm_compute/core/Types.h"
#include <map>
+#include <regex>
#include <vector>
namespace
{
-arm_compute::GPUTarget get_bifrost_target(const std::string &name)
+arm_compute::GPUTarget get_bifrost_target(const std::string &version)
{
- arm_compute::GPUTarget target = arm_compute::GPUTarget::MIDGARD;
-
- if(name == "G7")
+ if(version == "70")
{
- target = arm_compute::GPUTarget::G70;
+ return arm_compute::GPUTarget::G70;
}
-
- return target;
+ else
+ {
+ return arm_compute::GPUTarget::BIFROST;
+ }
}
-arm_compute::GPUTarget get_midgard_target(const std::string &name)
+arm_compute::GPUTarget get_midgard_target(const std::string &version)
{
- arm_compute::GPUTarget target = arm_compute::GPUTarget::MIDGARD;
-
- if(name == "T6")
+ switch(version[0])
{
- target = arm_compute::GPUTarget::T600;
+ case '6':
+ return arm_compute::GPUTarget::T600;
+ case '7':
+ return arm_compute::GPUTarget::T700;
+ case '8':
+ return arm_compute::GPUTarget::T800;
+ default:
+ return arm_compute::GPUTarget::MIDGARD;
}
- else if(name == "T7")
- {
- target = arm_compute::GPUTarget::T700;
- }
- else if(name == "T8")
- {
- target = arm_compute::GPUTarget::T800;
- }
-
- return target;
}
} // namespace
@@ -72,16 +68,22 @@
{
case DataType::U8:
return "uchar";
+ case DataType::QS8:
+ return "qs8";
case DataType::S8:
return "char";
case DataType::U16:
return "ushort";
case DataType::S16:
return "short";
+ case DataType::QS16:
+ return "qs16";
case DataType::U32:
return "uint";
case DataType::S32:
return "int";
+ case DataType::QS32:
+ return "qs32";
case DataType::U64:
return "ulong";
case DataType::S64:
@@ -96,6 +98,47 @@
}
}
+std::string get_data_size_from_data_type(const DataType &dt)
+{
+ switch(dt)
+ {
+ case DataType::U8:
+ case DataType::QS8:
+ case DataType::S8:
+ return "8";
+ case DataType::U16:
+ case DataType::S16:
+ case DataType::QS16:
+ case DataType::F16:
+ return "16";
+ case DataType::U32:
+ case DataType::S32:
+ case DataType::F32:
+ return "32";
+ case DataType::U64:
+ case DataType::S64:
+ return "64";
+ default:
+ ARM_COMPUTE_ERROR("Unsupported input data type.");
+ return "0";
+ }
+}
+
+std::string get_underlying_cl_type_from_data_type(const DataType &dt)
+{
+ switch(dt)
+ {
+ case DataType::QS8:
+ return "char";
+ case DataType::QS16:
+ return "short";
+ case DataType::QS32:
+ return "int";
+ default:
+ return get_cl_type_from_data_type(dt);
+ }
+}
+
const std::string &string_from_target(GPUTarget target)
{
static std::map<GPUTarget, const std::string> gpu_target_map =
@@ -113,53 +156,104 @@
GPUTarget get_target_from_device(cl::Device &device)
{
- const std::string name_mali("Mali-");
- GPUTarget target{ GPUTarget::MIDGARD };
-
- size_t name_size = 0;
- std::vector<char> name;
+ size_t name_size = 0;
// Query device name size
cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, 0, nullptr, &name_size);
ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (name_size == 0), "clGetDeviceInfo failed to return valid information");
- // Resize vector
- name.resize(name_size);
+ ARM_COMPUTE_UNUSED(err);
+
+ std::vector<char> name_buffer(name_size);
+
// Query device name
- err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, name_size, name.data(), nullptr);
+ err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, name_size, name_buffer.data(), nullptr);
ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
ARM_COMPUTE_UNUSED(err);
- std::string name_str(name.begin(), name.end());
- auto pos = name_str.find(name_mali);
+ std::regex mali_regex(R"(Mali-([TG])(\d+))");
+ std::string device_name(name_buffer.begin(), name_buffer.end());
+ std::smatch name_parts;
+ const bool found_mali = std::regex_search(device_name, name_parts, mali_regex);
- if(pos != std::string::npos)
+ if(!found_mali)
{
- ARM_COMPUTE_ERROR_ON_MSG((pos + name_mali.size() + 2) > name_str.size(), "Device name is shorter than expected.");
- std::string sub_name = name_str.substr(pos + name_mali.size(), 2);
+ ARM_COMPUTE_INFO("Can't find valid Mali GPU. Target is set to MIDGARD.");
+ return GPUTarget::MIDGARD;
+ }
- if(sub_name[0] == 'G')
- {
- target = get_bifrost_target(sub_name);
- }
- else if(sub_name[0] == 'T')
- {
- target = get_midgard_target(sub_name);
- }
- else
- {
+ const char target = name_parts.str(1)[0];
+ const std::string &version = name_parts.str(2);
+
+ switch(target)
+ {
+ case 'T':
+ return get_midgard_target(version);
+ case 'G':
+ return get_bifrost_target(version);
+ default:
ARM_COMPUTE_INFO("Mali GPU unknown. Target is set to the default one.");
- }
+ return GPUTarget::MIDGARD;
}
- else
- {
- ARM_COMPUTE_INFO("Can't find valid Mali GPU. Target is set to the default one.");
- }
-
- return target;
}
GPUTarget get_arch_from_target(GPUTarget target)
{
return (target & GPUTarget::GPU_ARCH_MASK);
}
+
+bool non_uniform_workgroup_support(const cl::Device &device)
+{
+ std::vector<char> extension;
+ size_t extension_size = 0;
+ cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_EXTENSIONS, 0, nullptr, &extension_size);
+ ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (extension_size == 0), "clGetDeviceInfo failed to return valid information");
+ ARM_COMPUTE_UNUSED(err);
+ // Resize vector
+ extension.resize(extension_size);
+ // Query extension
+ err = clGetDeviceInfo(device.get(), CL_DEVICE_EXTENSIONS, extension_size, extension.data(), nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
+ ARM_COMPUTE_UNUSED(err);
+
+ std::string extension_str(extension.begin(), extension.end());
+ auto pos = extension_str.find("cl_arm_non_uniform_work_group_size");
+ return (pos != std::string::npos);
+}
+
+CLVersion get_cl_version(const cl::Device &device)
+{
+ std::vector<char> version;
+ size_t version_size = 0;
+ cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_VERSION, 0, nullptr, &version_size);
+ ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (version_size == 0), "clGetDeviceInfo failed to return valid information");
+ ARM_COMPUTE_UNUSED(err);
+
+ // Resize vector
+ version.resize(version_size);
+ // Query version
+ err = clGetDeviceInfo(device.get(), CL_DEVICE_VERSION, version_size, version.data(), nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
+ ARM_COMPUTE_UNUSED(err);
+
+ std::string version_str(version.begin(), version.end());
+ if(version_str.find("OpenCL 2") != std::string::npos)
+ {
+ return CLVersion::CL20;
+ }
+ else if(version_str.find("OpenCL 1.2") != std::string::npos)
+ {
+ return CLVersion::CL12;
+ }
+ else if(version_str.find("OpenCL 1.1") != std::string::npos)
+ {
+ return CLVersion::CL11;
+ }
+ else if(version_str.find("OpenCL 1.0") != std::string::npos)
+ {
+ return CLVersion::CL10;
+ }
+
+ return CLVersion::UNKNOWN;
+}
+
} // namespace arm_compute
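
The refactored get_target_from_device() above replaces manual substring slicing with a regex over the device name. A self-contained sketch of the same decision logic (the enum values and helper names are local to this example, not the library's API):

#include <iostream>
#include <regex>
#include <string>

enum class GPUTarget { MIDGARD, T600, T700, T800, BIFROST, G70 };

GPUTarget target_from_name(const std::string &device_name)
{
    static const std::regex mali_regex(R"(Mali-([TG])(\d+))");
    std::smatch parts;
    if(!std::regex_search(device_name, parts, mali_regex))
    {
        return GPUTarget::MIDGARD; // no Mali GPU found, fall back to the default
    }
    const char        family  = parts.str(1)[0];
    const std::string version = parts.str(2);
    if(family == 'T') // Midgard: dispatch on the leading digit (6xx/7xx/8xx)
    {
        switch(version[0])
        {
            case '6': return GPUTarget::T600;
            case '7': return GPUTarget::T700;
            case '8': return GPUTarget::T800;
            default:  return GPUTarget::MIDGARD;
        }
    }
    // Bifrost: only "70" maps to a specific target in this patch
    return (version == "70") ? GPUTarget::G70 : GPUTarget::BIFROST;
}

int main()
{
    std::cout << static_cast<int>(target_from_name("Mali-T760")) << "\n"; // T700
    std::cout << static_cast<int>(target_from_name("Mali-G71")) << "\n";  // BIFROST
    return 0;
}
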
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 15a5d90..e165cf3 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -23,9 +23,11 @@
*/
#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Utils.h"
+#include <algorithm>
#include <fstream>
#include <iostream>
#include <utility>
@@ -142,32 +144,49 @@
{ "copy_plane", "channel_extract.cl" },
{ "copy_planes_3p", "channel_combine.cl" },
{ "copy_to_keypoint", "fast_corners.cl" },
+ { "depthwise_convolution_3x3", "depthwise_convolution.cl" },
+ { "depthwise_im2col", "depthwise_convolution.cl" },
+ { "depthwise_vector_to_tensor", "depthwise_convolution.cl" },
+ { "depthwise_weights_reshape", "depthwise_convolution.cl" },
+ { "dequantization_layer", "dequantization_layer.cl" },
{ "derivative", "derivative.cl" },
{ "dilate", "dilate.cl" },
+ { "direct_convolution1x1", "direct_convolution1x1.cl" },
+ { "direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl" },
+ { "direct_convolution3x3", "direct_convolution3x3.cl" },
+ { "direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl" },
+ { "direct_convolution5x5", "direct_convolution5x5.cl" },
+ { "direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl" },
{ "erode", "erode.cl" },
{ "fast_corners", "fast_corners.cl" },
{ "fill_image_borders_constant", "fill_border.cl" },
{ "fill_image_borders_replicate", "fill_border.cl" },
{ "finalize", "optical_flow_pyramid_lk.cl" },
+ { "floor_layer", "floor.cl" },
{ "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
{ "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
- { "gemm_accumulate_biases_f16", "gemm.cl" },
- { "gemm_accumulate_biases_f32", "gemm.cl" },
+ { "gemm_accumulate_biases", "gemm.cl" },
{ "gemm_interleave4x4_8bit", "gemm.cl" },
{ "gemm_interleave4x4_16bit", "gemm.cl" },
{ "gemm_interleave4x4_32bit", "gemm.cl" },
{ "gemm_ma_f16", "gemm.cl" },
{ "gemm_ma_f32", "gemm.cl" },
- { "gemm_mm_u8", "gemm.cl" },
- { "gemm_mm_f16", "gemm.cl" },
- { "gemm_mm_f32_midgard", "gemm.cl" },
- { "gemm_mm_f32_bifrost", "gemm.cl" },
- { "gemm_vm_f16", "gemm.cl" },
- { "gemm_vm_f32", "gemm.cl" },
+ { "gemm_ma_qs8", "gemm.cl" },
+ { "gemm_ma_qs16", "gemm.cl" },
+ { "gemm_mv", "gemv.cl" },
+ { "gemm_mm_interleaved_transposed_u8", "gemm.cl" },
+ { "gemm_mm_interleaved_transposed_f16", "gemm.cl" },
+ { "gemm_mm_interleaved_transposed_f32_midgard", "gemm.cl" },
+ { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" },
+ { "gemm_mm_interleaved_transposed_qs8", "gemm.cl" },
+ { "gemm_mm_interleaved_transposed_qs16", "gemm.cl" },
+ { "gemm_mm_floating_point", "gemm.cl" },
+ { "gemm_mm_qs8", "gemm.cl" },
+ { "gemm_mm_qs16", "gemm.cl" },
{ "gemm_lc_vm_f32", "gemm.cl" },
- { "gemm_transpose1x16_u8", "gemm.cl" },
- { "gemm_transpose1x8_f16", "gemm.cl" },
- { "gemm_transpose1x4_f32", "gemm.cl" },
+ { "gemm_transpose1x16", "gemm.cl" },
+ { "gemm_transpose1x8", "gemm.cl" },
+ { "gemm_transpose1x4", "gemm.cl" },
{ "harris_score_3x3", "harris_corners.cl" },
{ "harris_score_5x5", "harris_corners.cl" },
{ "harris_score_7x7", "harris_corners.cl" },
@@ -180,6 +199,7 @@
{ "hog_orientation_binning", "hog.cl" },
{ "hysteresis", "canny.cl" },
{ "im2col_generic", "convolution_layer.cl" },
+ { "im2col_kernel3x3_padx0_pady0", "convolution_layer.cl" },
{ "im2col_reduced", "convolution_layer.cl" },
{ "init_level", "optical_flow_pyramid_lk.cl" },
{ "init_level_max", "optical_flow_pyramid_lk.cl" },
@@ -190,12 +210,14 @@
{ "IYUV_to_RGB888_bt709", "color_convert.cl" },
{ "IYUV_to_RGBA8888_bt709", "color_convert.cl" },
{ "IYUV_to_YUV444_bt709", "color_convert.cl" },
+ { "l2_normalize", "l2_normalize.cl" },
{ "lktracker_stage0", "optical_flow_pyramid_lk.cl" },
{ "lktracker_stage1", "optical_flow_pyramid_lk.cl" },
{ "magnitude_phase", "magnitude_phase.cl" },
{ "mean_stddev_accumulate", "mean_stddev.cl" },
{ "minmax", "minmaxloc.cl" },
{ "minmax_border", "minmaxloc.cl" },
+ { "minmax_layer", "minmax_layer.cl" },
{ "minmaxloc", "minmaxloc.cl" },
{ "non_linear_filter_box3x3", "non_linear_filter3x3.cl" },
{ "non_linear_filter_cross3x3", "non_linear_filter3x3.cl" },
@@ -219,8 +241,14 @@
{ "pixelwise_mul_int", "pixelwise_mul_int.cl" },
{ "pooling_layer_2", "pooling_layer.cl" },
{ "pooling_layer_3", "pooling_layer.cl" },
+ { "pooling_layer_3_optimized", "pooling_layer.cl" },
+ { "pooling_layer_7", "pooling_layer.cl" },
+ { "pooling_layer_N", "pooling_layer.cl" },
+ { "quantization_layer", "quantization_layer.cl" },
+ { "reduction_operation", "reduction_operation.cl" },
{ "remap_nearest_neighbour", "remap.cl" },
{ "remap_bilinear", "remap.cl" },
+ { "reshape_layer", "reshape_layer.cl" },
{ "reshape_to_columns", "convolution_layer.cl" },
{ "RGB888_to_IYUV_bt709", "color_convert.cl" },
{ "RGB888_to_NV12_bt709", "color_convert.cl" },
@@ -230,6 +258,7 @@
{ "RGBA8888_to_NV12_bt709", "color_convert.cl" },
{ "RGBA8888_to_RGB888_bt709", "color_convert.cl" },
{ "RGBA8888_to_YUV444_bt709", "color_convert.cl" },
+ { "roi_pooling_layer", "roi_pooling_layer.cl" },
{ "scale_nearest_neighbour", "scale.cl" },
{ "scale_bilinear", "scale.cl" },
{ "scharr3x3", "scharr_filter.cl" },
@@ -333,6 +362,14 @@
#include "./cl_kernels/depth_convert.clembed"
},
{
+ "depthwise_convolution.cl",
+#include "./cl_kernels/depthwise_convolution.clembed"
+ },
+ {
+ "dequantization_layer.cl",
+#include "./cl_kernels/dequantization_layer.clembed"
+ },
+ {
"derivative.cl",
#include "./cl_kernels/derivative.clembed"
},
@@ -341,6 +378,18 @@
#include "./cl_kernels/dilate.clembed"
},
{
+ "direct_convolution1x1.cl",
+#include "./cl_kernels/direct_convolution1x1.clembed"
+ },
+ {
+ "direct_convolution3x3.cl",
+#include "./cl_kernels/direct_convolution3x3.clembed"
+ },
+ {
+ "direct_convolution5x5.cl",
+#include "./cl_kernels/direct_convolution5x5.clembed"
+ },
+ {
"erode.cl",
#include "./cl_kernels/erode.clembed"
},
@@ -353,6 +402,14 @@
#include "./cl_kernels/fill_border.clembed"
},
{
+ "fixed_point.h",
+#include "./cl_kernels/fixed_point.hembed"
+ },
+ {
+ "floor.cl",
+#include "./cl_kernels/floor.clembed"
+ },
+ {
"gaussian_pyramid.cl",
#include "./cl_kernels/gaussian_pyramid.clembed"
},
@@ -361,6 +418,10 @@
#include "./cl_kernels/gemm.clembed"
},
{
+ "gemv.cl",
+#include "./cl_kernels/gemv.clembed"
+ },
+ {
"harris_corners.cl",
#include "./cl_kernels/harris_corners.clembed"
},
@@ -381,6 +442,10 @@
#include "./cl_kernels/integral_image.clembed"
},
{
+ "l2_normalize.cl",
+#include "./cl_kernels/l2_normalize.clembed"
+ },
+ {
"magnitude_phase.cl",
#include "./cl_kernels/magnitude_phase.clembed"
},
@@ -393,6 +458,10 @@
#include "./cl_kernels/minmaxloc.clembed"
},
{
+ "minmax_layer.cl",
+#include "./cl_kernels/minmax_layer.clembed"
+ },
+ {
"non_linear_filter3x3.cl",
#include "./cl_kernels/non_linear_filter3x3.clembed"
},
@@ -433,10 +502,26 @@
#include "./cl_kernels/pooling_layer.clembed"
},
{
+ "quantization_layer.cl",
+#include "./cl_kernels/quantization_layer.clembed"
+ },
+ {
+ "reduction_operation.cl",
+#include "./cl_kernels/reduction_operation.clembed"
+ },
+ {
"remap.cl",
#include "./cl_kernels/remap.clembed"
},
{
+ "reshape_layer.cl",
+#include "./cl_kernels/reshape_layer.clembed"
+ },
+ {
+ "roi_pooling_layer.cl",
+#include "./cl_kernels/roi_pooling_layer.clembed"
+ },
+ {
"scale.cl",
#include "./cl_kernels/scale.clembed"
},
@@ -479,12 +564,12 @@
{
"warp_perspective.cl",
#include "./cl_kernels/warp_perspective.clembed"
- }
-#endif
+ },
+#endif /* EMBEDDED_KERNELS */
};
CLKernelLibrary::CLKernelLibrary()
- : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
+ : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map(), _max_workgroup_size(0)
{
}
@@ -504,9 +589,25 @@
ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
}
+ std::string concat_str;
+
+ if(non_uniform_workgroup_support(_device))
+ {
+ concat_str += " -cl-arm-non-uniform-work-group-size ";
+ }
+ else if(get_cl_version(_device) == CLVersion::CL20)
+ {
+ concat_str += " -cl-std=CL2.0 ";
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
+ }
+
// Check if the program has been built before with same build options.
- const std::string program_name = kernel_program_it->second;
- const std::string build_options = stringify_set(build_options_set);
+ const std::string program_name = kernel_program_it->second;
+ const std::string build_options = stringify_set(build_options_set) + concat_str;
+
const std::string built_program_name = program_name + "_" + build_options;
auto built_program_it = _built_programs_map.find(built_program_name);
@@ -553,7 +654,7 @@
}
program = Program(_context, program_name, program_source_it->second);
-#else
+#else /* EMBEDDED_KERNELS */
// Check for binary
std::string source_name = _kernel_path + program_name;
std::string binary_name = source_name + "bin";
@@ -571,7 +672,7 @@
{
ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str());
}
-#endif
+#endif /* EMBEDDED_KERNELS */
// Insert program to program map
const auto new_program = _programs_map.emplace(program_name, std::move(program));
@@ -581,7 +682,7 @@
std::string CLKernelLibrary::stringify_set(const StringSet &s) const
{
- std::string concat_set = "-cl-arm-non-uniform-work-group-size ";
+ std::string concat_set;
#ifndef EMBEDDED_KERNELS
concat_set += "-I" + _kernel_path + " ";
@@ -595,3 +696,32 @@
return concat_set;
}
+
+std::string CLKernelLibrary::get_program_source(const std::string &program_name)
+{
+ const auto program_source_it = _program_source_map.find(program_name);
+
+ if(program_source_it == _program_source_map.end())
+ {
+ ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+ }
+
+ return program_source_it->second;
+}
+
+size_t CLKernelLibrary::max_local_workgroup_size()
+{
+ if(_max_workgroup_size == 0)
+ {
+ size_t err = clGetDeviceInfo(_device.get(), CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &_max_workgroup_size, nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
+ ARM_COMPUTE_UNUSED(err);
+ }
+
+ return _max_workgroup_size;
+}
+
+cl::NDRange CLKernelLibrary::default_ndrange()
+{
+ return cl::NDRange(std::min<size_t>(_max_workgroup_size, 128u), 1);
+}
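
Two behaviours added above are easy to state in isolation: the extra build options now depend on device capabilities (the -cl-arm-non-uniform-work-group-size flag moved out of stringify_set()), and the default local work size is capped at 128. A hedged sketch, with the device queries reduced to plain booleans for illustration:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>

// Illustrative only: the real values come from clGetDeviceInfo queries
// (CL_DEVICE_EXTENSIONS and CL_DEVICE_VERSION).
std::string extra_build_options(bool has_non_uniform_wg_extension, bool is_cl20_device)
{
    if(has_non_uniform_wg_extension)
    {
        return " -cl-arm-non-uniform-work-group-size ";
    }
    if(is_cl20_device)
    {
        // OpenCL 2.0 supports non-uniform work-groups natively
        return " -cl-std=CL2.0 ";
    }
    throw std::runtime_error("Non-uniform workgroup size is not supported");
}

// First dimension of default_ndrange(): at most 128 work-items, capped by
// the device's CL_DEVICE_MAX_WORK_GROUP_SIZE.
std::size_t default_local_size(std::size_t max_workgroup_size)
{
    return std::min<std::size_t>(max_workgroup_size, 128u);
}

int main()
{
    std::cout << extra_build_options(true, false) << "| lws=" << default_local_size(256) << "\n";
    return 0;
}
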
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 7ac0fe3..1e04f00 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -31,7 +31,6 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
#include <cstddef>
@@ -44,7 +43,10 @@
return;
}
- ARM_COMPUTE_ERROR_ON((0 == (window.x().end() - window.x().start())) || (0 == (window.y().end() - window.y().start())));
+ if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
+ {
+ return;
+ }
cl::NDRange gws((window.x().end() - window.x().start()) / window.x().step(),
(window.y().end() - window.y().start()) / window.y().step(),
@@ -61,7 +63,7 @@
}
ICLKernel::ICLKernel()
- : _kernel(nullptr), _lws_hint(cl::Range_128_1), _target(CLScheduler::get().target())
+ : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id)
{
}
@@ -71,12 +73,6 @@
}
template <unsigned int dimension_size>
-unsigned int ICLKernel::num_arguments_per_tensor() const
-{
- return 2 + 2 * dimension_size;
-}
-
-template <unsigned int dimension_size>
void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window)
{
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
@@ -123,6 +119,16 @@
add_tensor_argument<3>(idx, tensor, window);
}
+void ICLKernel::add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+{
+ add_tensor_argument<4>(idx, tensor, window);
+}
+
+unsigned int ICLKernel::num_arguments_per_1D_array() const
+{
+ return num_arguments_per_array<1>();
+}
+
unsigned int ICLKernel::num_arguments_per_1D_tensor() const
{
return num_arguments_per_tensor<1>();
@@ -138,6 +144,11 @@
return num_arguments_per_tensor<3>();
}
+unsigned int ICLKernel::num_arguments_per_4D_tensor() const
+{
+ return num_arguments_per_tensor<4>();
+}
+
void ICLKernel::set_target(cl::Device &device)
{
_target = get_target_from_device(device);
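
The removed num_arguments_per_tensor() definition above (and the new per-dimension wrappers) follow a simple rule that the kernel parameter lists elsewhere in this patch make visible: each tensor is passed as one buffer pointer, one first-element offset, and a (stride, step) pair per dimension. A minimal sketch of that count:

#include <cstdio>

// 2 fixed arguments (pointer + offset_first_element_in_bytes) plus a
// (stride, step) pair for every dimension.
template <unsigned int dimension_size>
constexpr unsigned int num_arguments_per_tensor()
{
    return 2 + 2 * dimension_size;
}

int main()
{
    std::printf("1D:%u 2D:%u 3D:%u 4D:%u\n",
                num_arguments_per_tensor<1>(), num_arguments_per_tensor<2>(),
                num_arguments_per_tensor<3>(), num_arguments_per_tensor<4>());
    return 0;
}
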
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 3b8dfd2..1d04f39 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -27,114 +27,94 @@
#include <dlfcn.h>
#include <iostream>
-using clBuildProgram_func = cl_int (*)(cl_program, cl_uint, const cl_device_id *, const char *, void (*pfn_notify)(cl_program, void *), void *);
-using clEnqueueNDRangeKernel_func = cl_int (*)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
-using clSetKernelArg_func = cl_int (*)(cl_kernel, cl_uint, size_t, const void *);
-using clReleaseMemObject_func = cl_int (*)(cl_mem);
-using clEnqueueUnmapMemObject_func = cl_int (*)(cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *);
-using clRetainCommandQueue_func = cl_int (*)(cl_command_queue command_queue);
-using clReleaseContext_func = cl_int (*)(cl_context);
-using clReleaseEvent_func = cl_int (*)(cl_event);
-using clEnqueueWriteBuffer_func = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
-using clEnqueueReadBuffer_func = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
-using clGetProgramBuildInfo_func = cl_int (*)(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *);
-using clRetainProgram_func = cl_int (*)(cl_program program);
-using clEnqueueMapBuffer_func = void *(*)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *);
-using clReleaseCommandQueue_func = cl_int (*)(cl_command_queue);
-using clCreateProgramWithBinary_func = cl_program (*)(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *);
-using clRetainContext_func = cl_int (*)(cl_context context);
-using clReleaseProgram_func = cl_int (*)(cl_program program);
-using clFlush_func = cl_int (*)(cl_command_queue command_queue);
-using clGetProgramInfo_func = cl_int (*)(cl_program, cl_program_info, size_t, void *, size_t *);
-using clCreateKernel_func = cl_kernel (*)(cl_program, const char *, cl_int *);
-using clRetainKernel_func = cl_int (*)(cl_kernel kernel);
-using clCreateBuffer_func = cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *);
-using clCreateProgramWithSource_func = cl_program (*)(cl_context, cl_uint, const char **, const size_t *, cl_int *);
-using clReleaseKernel_func = cl_int (*)(cl_kernel kernel);
-using clGetDeviceInfo_func = cl_int (*)(cl_device_id, cl_device_info, size_t, void *, size_t *);
-using clGetDeviceIDs_func = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *);
-
-class CLSymbols
+namespace arm_compute
{
-private:
- CLSymbols()
+CLSymbols &CLSymbols::get()
+{
+ static CLSymbols symbols;
+ return symbols;
+}
+
+bool CLSymbols::load_default()
+{
+ static const std::vector<std::string> libraries{ "libOpenCL.so", "libGLES_mali.so", "libmali.so" };
+
+ if(_loaded.first)
{
- void *handle = dlopen("libOpenCL.so", RTLD_LAZY | RTLD_LOCAL);
- if(handle == nullptr)
+ return _loaded.second;
+ }
+
+ // Indicate that default loading has been tried
+ _loaded.first = true;
+
+ for(const auto &lib : libraries)
+ {
+ if(load(lib))
{
- std::cerr << "Can't load libOpenCL.so: " << dlerror() << std::endl;
- }
- else
- {
- clBuildProgram = reinterpret_cast<clBuildProgram_func>(dlsym(handle, "clBuildProgram"));
- clEnqueueNDRangeKernel = reinterpret_cast<clEnqueueNDRangeKernel_func>(dlsym(handle, "clEnqueueNDRangeKernel"));
- clSetKernelArg = reinterpret_cast<clSetKernelArg_func>(dlsym(handle, "clSetKernelArg"));
- clReleaseKernel = reinterpret_cast<clReleaseKernel_func>(dlsym(handle, "clReleaseKernel"));
- clCreateProgramWithSource = reinterpret_cast<clCreateProgramWithSource_func>(dlsym(handle, "clCreateProgramWithSource"));
- clCreateBuffer = reinterpret_cast<clCreateBuffer_func>(dlsym(handle, "clCreateBuffer"));
- clRetainKernel = reinterpret_cast<clRetainKernel_func>(dlsym(handle, "clRetainKernel"));
- clCreateKernel = reinterpret_cast<clCreateKernel_func>(dlsym(handle, "clCreateKernel"));
- clGetProgramInfo = reinterpret_cast<clGetProgramInfo_func>(dlsym(handle, "clGetProgramInfo"));
- clFlush = reinterpret_cast<clFlush_func>(dlsym(handle, "clFlush"));
- clReleaseProgram = reinterpret_cast<clReleaseProgram_func>(dlsym(handle, "clReleaseProgram"));
- clRetainContext = reinterpret_cast<clRetainContext_func>(dlsym(handle, "clRetainContext"));
- clCreateProgramWithBinary = reinterpret_cast<clCreateProgramWithBinary_func>(dlsym(handle, "clCreateProgramWithBinary"));
- clReleaseCommandQueue = reinterpret_cast<clReleaseCommandQueue_func>(dlsym(handle, "clReleaseCommandQueue"));
- clEnqueueMapBuffer = reinterpret_cast<clEnqueueMapBuffer_func>(dlsym(handle, "clEnqueueMapBuffer"));
- clRetainProgram = reinterpret_cast<clRetainProgram_func>(dlsym(handle, "clRetainProgram"));
- clGetProgramBuildInfo = reinterpret_cast<clGetProgramBuildInfo_func>(dlsym(handle, "clGetProgramBuildInfo"));
- clEnqueueReadBuffer = reinterpret_cast<clEnqueueReadBuffer_func>(dlsym(handle, "clEnqueueReadBuffer"));
- clEnqueueWriteBuffer = reinterpret_cast<clEnqueueWriteBuffer_func>(dlsym(handle, "clEnqueueWriteBuffer"));
- clReleaseEvent = reinterpret_cast<clReleaseEvent_func>(dlsym(handle, "clReleaseEvent"));
- clReleaseContext = reinterpret_cast<clReleaseContext_func>(dlsym(handle, "clReleaseContext"));
- clRetainCommandQueue = reinterpret_cast<clRetainCommandQueue_func>(dlsym(handle, "clRetainCommandQueue"));
- clEnqueueUnmapMemObject = reinterpret_cast<clEnqueueUnmapMemObject_func>(dlsym(handle, "clEnqueueUnmapMemObject"));
- clReleaseMemObject = reinterpret_cast<clReleaseMemObject_func>(dlsym(handle, "clReleaseMemObject"));
- clGetDeviceInfo = reinterpret_cast<clGetDeviceInfo_func>(dlsym(handle, "clGetDeviceInfo"));
- clGetDeviceIDs = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs"));
- dlclose(handle);
+ return true;
}
}
-public:
- static CLSymbols &get()
+ std::cerr << "Couldn't find any OpenCL library.\n";
+ return false;
+}
+
+bool CLSymbols::load(const std::string &library)
+{
+ void *handle = dlopen(library.c_str(), RTLD_LAZY | RTLD_LOCAL);
+
+ if(handle == nullptr)
{
- static CLSymbols symbols = CLSymbols();
- return symbols;
+ std::cerr << "Can't load " << library << ": " << dlerror() << "\n";
+ // Set status of loading to failed
+ _loaded.second = false;
+ return false;
}
- clBuildProgram_func clBuildProgram = nullptr;
- clEnqueueNDRangeKernel_func clEnqueueNDRangeKernel = nullptr;
- clSetKernelArg_func clSetKernelArg = nullptr;
- clReleaseKernel_func clReleaseKernel = nullptr;
- clCreateProgramWithSource_func clCreateProgramWithSource = nullptr;
- clCreateBuffer_func clCreateBuffer = nullptr;
- clRetainKernel_func clRetainKernel = nullptr;
- clCreateKernel_func clCreateKernel = nullptr;
- clGetProgramInfo_func clGetProgramInfo = nullptr;
- clFlush_func clFlush = nullptr;
- clReleaseProgram_func clReleaseProgram = nullptr;
- clRetainContext_func clRetainContext = nullptr;
- clCreateProgramWithBinary_func clCreateProgramWithBinary = nullptr;
- clReleaseCommandQueue_func clReleaseCommandQueue = nullptr;
- clEnqueueMapBuffer_func clEnqueueMapBuffer = nullptr;
- clRetainProgram_func clRetainProgram = nullptr;
- clGetProgramBuildInfo_func clGetProgramBuildInfo = nullptr;
- clEnqueueReadBuffer_func clEnqueueReadBuffer = nullptr;
- clEnqueueWriteBuffer_func clEnqueueWriteBuffer = nullptr;
- clReleaseEvent_func clReleaseEvent = nullptr;
- clReleaseContext_func clReleaseContext = nullptr;
- clRetainCommandQueue_func clRetainCommandQueue = nullptr;
- clEnqueueUnmapMemObject_func clEnqueueUnmapMemObject = nullptr;
- clReleaseMemObject_func clReleaseMemObject = nullptr;
- clGetDeviceInfo_func clGetDeviceInfo = nullptr;
- clGetDeviceIDs_func clGetDeviceIDs = nullptr;
-};
+ clBuildProgram = reinterpret_cast<clBuildProgram_func>(dlsym(handle, "clBuildProgram"));
+ clEnqueueNDRangeKernel = reinterpret_cast<clEnqueueNDRangeKernel_func>(dlsym(handle, "clEnqueueNDRangeKernel"));
+ clSetKernelArg = reinterpret_cast<clSetKernelArg_func>(dlsym(handle, "clSetKernelArg"));
+ clReleaseKernel = reinterpret_cast<clReleaseKernel_func>(dlsym(handle, "clReleaseKernel"));
+ clCreateProgramWithSource = reinterpret_cast<clCreateProgramWithSource_func>(dlsym(handle, "clCreateProgramWithSource"));
+ clCreateBuffer = reinterpret_cast<clCreateBuffer_func>(dlsym(handle, "clCreateBuffer"));
+ clRetainKernel = reinterpret_cast<clRetainKernel_func>(dlsym(handle, "clRetainKernel"));
+ clCreateKernel = reinterpret_cast<clCreateKernel_func>(dlsym(handle, "clCreateKernel"));
+ clGetProgramInfo = reinterpret_cast<clGetProgramInfo_func>(dlsym(handle, "clGetProgramInfo"));
+ clFlush = reinterpret_cast<clFlush_func>(dlsym(handle, "clFlush"));
+ clFinish = reinterpret_cast<clFinish_func>(dlsym(handle, "clFinish"));
+ clReleaseProgram = reinterpret_cast<clReleaseProgram_func>(dlsym(handle, "clReleaseProgram"));
+ clRetainContext = reinterpret_cast<clRetainContext_func>(dlsym(handle, "clRetainContext"));
+ clCreateProgramWithBinary = reinterpret_cast<clCreateProgramWithBinary_func>(dlsym(handle, "clCreateProgramWithBinary"));
+ clReleaseCommandQueue = reinterpret_cast<clReleaseCommandQueue_func>(dlsym(handle, "clReleaseCommandQueue"));
+ clEnqueueMapBuffer = reinterpret_cast<clEnqueueMapBuffer_func>(dlsym(handle, "clEnqueueMapBuffer"));
+ clRetainProgram = reinterpret_cast<clRetainProgram_func>(dlsym(handle, "clRetainProgram"));
+ clGetProgramBuildInfo = reinterpret_cast<clGetProgramBuildInfo_func>(dlsym(handle, "clGetProgramBuildInfo"));
+ clEnqueueReadBuffer = reinterpret_cast<clEnqueueReadBuffer_func>(dlsym(handle, "clEnqueueReadBuffer"));
+ clEnqueueWriteBuffer = reinterpret_cast<clEnqueueWriteBuffer_func>(dlsym(handle, "clEnqueueWriteBuffer"));
+ clReleaseEvent = reinterpret_cast<clReleaseEvent_func>(dlsym(handle, "clReleaseEvent"));
+ clReleaseContext = reinterpret_cast<clReleaseContext_func>(dlsym(handle, "clReleaseContext"));
+ clRetainCommandQueue = reinterpret_cast<clRetainCommandQueue_func>(dlsym(handle, "clRetainCommandQueue"));
+ clEnqueueUnmapMemObject = reinterpret_cast<clEnqueueUnmapMemObject_func>(dlsym(handle, "clEnqueueUnmapMemObject"));
+ clRetainMemObject = reinterpret_cast<clRetainMemObject_func>(dlsym(handle, "clRetainMemObject"));
+ clReleaseMemObject = reinterpret_cast<clReleaseMemObject_func>(dlsym(handle, "clReleaseMemObject"));
+ clGetDeviceInfo = reinterpret_cast<clGetDeviceInfo_func>(dlsym(handle, "clGetDeviceInfo"));
+ clGetDeviceIDs = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs"));
+ clRetainEvent = reinterpret_cast<clRetainEvent_func>(dlsym(handle, "clRetainEvent"));
-bool arm_compute::opencl_is_available()
+ dlclose(handle);
+
+ // Disable default loading and set status to successful
+ _loaded = std::make_pair(true, true);
+
+ return true;
+}
+
+bool opencl_is_available()
{
+ CLSymbols::get().load_default();
return CLSymbols::get().clBuildProgram != nullptr;
}
+} // namespace arm_compute
cl_int clBuildProgram(
cl_program program,
@@ -144,7 +124,8 @@
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
void *user_data)
{
- auto func = CLSymbols::get().clBuildProgram;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clBuildProgram;
if(func != nullptr)
{
return func(program, num_devices, device_list, options, pfn_notify, user_data);
@@ -166,7 +147,8 @@
const cl_event *event_wait_list,
cl_event *event)
{
- auto func = CLSymbols::get().clEnqueueNDRangeKernel;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel;
if(func != nullptr)
{
return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event);
@@ -183,7 +165,8 @@
size_t arg_size,
const void *arg_value)
{
- auto func = CLSymbols::get().clSetKernelArg;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clSetKernelArg;
if(func != nullptr)
{
return func(kernel, arg_index, arg_size, arg_value);
@@ -194,9 +177,24 @@
}
}
+cl_int clRetainMemObject(cl_mem memobj)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clRetainMemObject;
+ if(func != nullptr)
+ {
+ return func(memobj);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
cl_int clReleaseMemObject(cl_mem memobj)
{
- auto func = CLSymbols::get().clReleaseMemObject;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clReleaseMemObject;
if(func != nullptr)
{
return func(memobj);
@@ -215,7 +213,8 @@
const cl_event *event_wait_list,
cl_event *event)
{
- auto func = CLSymbols::get().clEnqueueUnmapMemObject;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clEnqueueUnmapMemObject;
if(func != nullptr)
{
return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event);
@@ -228,7 +227,8 @@
cl_int clRetainCommandQueue(cl_command_queue command_queue)
{
- auto func = CLSymbols::get().clRetainCommandQueue;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clRetainCommandQueue;
if(func != nullptr)
{
return func(command_queue);
@@ -241,7 +241,8 @@
cl_int clReleaseContext(cl_context context)
{
- auto func = CLSymbols::get().clReleaseContext;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clReleaseContext;
if(func != nullptr)
{
return func(context);
@@ -253,7 +254,8 @@
}
cl_int clReleaseEvent(cl_event event)
{
- auto func = CLSymbols::get().clReleaseEvent;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clReleaseEvent;
if(func != nullptr)
{
return func(event);
@@ -275,7 +277,8 @@
const cl_event *event_wait_list,
cl_event *event)
{
- auto func = CLSymbols::get().clEnqueueWriteBuffer;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clEnqueueWriteBuffer;
if(func != nullptr)
{
return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
@@ -297,7 +300,8 @@
const cl_event *event_wait_list,
cl_event *event)
{
- auto func = CLSymbols::get().clEnqueueReadBuffer;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clEnqueueReadBuffer;
if(func != nullptr)
{
return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
@@ -316,7 +320,8 @@
void *param_value,
size_t *param_value_size_ret)
{
- auto func = CLSymbols::get().clGetProgramBuildInfo;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetProgramBuildInfo;
if(func != nullptr)
{
return func(program, device, param_name, param_value_size, param_value, param_value_size_ret);
@@ -329,7 +334,8 @@
cl_int clRetainProgram(cl_program program)
{
- auto func = CLSymbols::get().clRetainProgram;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clRetainProgram;
if(func != nullptr)
{
return func(program);
@@ -352,7 +358,8 @@
cl_event *event,
cl_int *errcode_ret)
{
- auto func = CLSymbols::get().clEnqueueMapBuffer;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clEnqueueMapBuffer;
if(func != nullptr)
{
return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret);
@@ -369,7 +376,8 @@
cl_int clReleaseCommandQueue(cl_command_queue command_queue)
{
- auto func = CLSymbols::get().clReleaseCommandQueue;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clReleaseCommandQueue;
if(func != nullptr)
{
return func(command_queue);
@@ -389,7 +397,8 @@
cl_int *binary_status,
cl_int *errcode_ret)
{
- auto func = CLSymbols::get().clCreateProgramWithBinary;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clCreateProgramWithBinary;
if(func != nullptr)
{
return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret);
@@ -406,7 +415,8 @@
cl_int clRetainContext(cl_context context)
{
- auto func = CLSymbols::get().clRetainContext;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clRetainContext;
if(func != nullptr)
{
return func(context);
@@ -419,7 +429,8 @@
cl_int clReleaseProgram(cl_program program)
{
- auto func = CLSymbols::get().clReleaseProgram;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clReleaseProgram;
if(func != nullptr)
{
return func(program);
@@ -432,7 +443,22 @@
cl_int clFlush(cl_command_queue command_queue)
{
- auto func = CLSymbols::get().clFlush;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clFlush;
+ if(func != nullptr)
+ {
+ return func(command_queue);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clFinish(cl_command_queue command_queue)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clFinish;
if(func != nullptr)
{
return func(command_queue);
@@ -450,7 +476,8 @@
void *param_value,
size_t *param_value_size_ret)
{
- auto func = CLSymbols::get().clGetProgramInfo;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetProgramInfo;
if(func != nullptr)
{
return func(program, param_name, param_value_size, param_value, param_value_size_ret);
@@ -466,7 +493,8 @@
const char *kernel_name,
cl_int *errcode_ret)
{
- auto func = CLSymbols::get().clCreateKernel;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clCreateKernel;
if(func != nullptr)
{
return func(program, kernel_name, errcode_ret);
@@ -483,7 +511,8 @@
cl_int clRetainKernel(cl_kernel kernel)
{
- auto func = CLSymbols::get().clRetainKernel;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clRetainKernel;
if(func != nullptr)
{
return func(kernel);
@@ -501,7 +530,8 @@
void *host_ptr,
cl_int *errcode_ret)
{
- auto func = CLSymbols::get().clCreateBuffer;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clCreateBuffer;
if(func != nullptr)
{
return func(context, flags, size, host_ptr, errcode_ret);
@@ -523,7 +553,8 @@
const size_t *lengths,
cl_int *errcode_ret)
{
- auto func = CLSymbols::get().clCreateProgramWithSource;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clCreateProgramWithSource;
if(func != nullptr)
{
return func(context, count, strings, lengths, errcode_ret);
@@ -540,7 +571,8 @@
cl_int clReleaseKernel(cl_kernel kernel)
{
- auto func = CLSymbols::get().clReleaseKernel;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clReleaseKernel;
if(func != nullptr)
{
return func(kernel);
@@ -557,7 +589,8 @@
cl_device_id *devices,
cl_uint *num_devices)
{
- auto func = CLSymbols::get().clGetDeviceIDs;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetDeviceIDs;
if(func != nullptr)
{
return func(platform, device_type, num_entries, devices, num_devices);
@@ -574,7 +607,8 @@
void *param_value,
size_t *param_value_size_ret)
{
- auto func = CLSymbols::get().clGetDeviceInfo;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetDeviceInfo;
if(func != nullptr)
{
return func(device, param_name, param_value_size, param_value, param_value_size_ret);
@@ -584,3 +618,17 @@
return CL_OUT_OF_RESOURCES;
}
}
+
+cl_int clRetainEvent(cl_event event)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clRetainEvent;
+ if(func != nullptr)
+ {
+ return func(event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
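
The rewritten OpenCL.cpp above turns CLSymbols into a lazily initialised singleton that searches several candidate libraries and resolves every cl* entry point through dlsym. A reduced sketch of that pattern, showing a single entry point and using plain void * instead of the real function-pointer typedefs:

#include <dlfcn.h>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Illustrative only: one symbol instead of the full OpenCL API surface.
class Symbols
{
public:
    static Symbols &get()
    {
        static Symbols symbols;
        return symbols;
    }

    bool load_default()
    {
        static const std::vector<std::string> libraries{ "libOpenCL.so", "libGLES_mali.so", "libmali.so" };

        if(_loaded.first)
        {
            return _loaded.second; // the search is attempted only once
        }
        _loaded.first = true;

        for(const auto &lib : libraries)
        {
            if(load(lib))
            {
                return true;
            }
        }
        std::cerr << "Couldn't find any OpenCL library.\n";
        return false;
    }

    bool load(const std::string &library)
    {
        void *handle = dlopen(library.c_str(), RTLD_LAZY | RTLD_LOCAL);
        if(handle == nullptr)
        {
            _loaded.second = false;
            return false;
        }
        clBuildProgram = dlsym(handle, "clBuildProgram"); // one dlsym per entry point in the real code
        dlclose(handle);
        _loaded = std::make_pair(true, true);
        return true;
    }

    void *clBuildProgram = nullptr;

private:
    std::pair<bool, bool> _loaded{ false, false };
};

// Mirrors opencl_is_available(): OpenCL is usable once a library was found
// and the first symbol resolved.
bool opencl_is_available()
{
    Symbols::get().load_default();
    return Symbols::get().clBuildProgram != nullptr;
}

int main()
{
    std::cout << std::boolalpha << opencl_is_available() << "\n";
    return 0;
}

(Linking needs -ldl; every exported cl* wrapper in the hunks above repeats the load_default() plus null-check pattern shown here.)
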
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
index e3cbb6c..4424a66 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -23,14 +23,109 @@
*/
#include "helpers.h"
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+
+#define CONST_ONE (1 << FIXED_POINT_POSITION)
+#define ABS_OP(a) ABS_SAT_OP_EXPAND((a), DATA_TYPE, VEC_SIZE)
+#define ADD_OP(a, b) ADD_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE)
+#define SUB_OP(a, b) SUB_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE)
+#define MUL_OP(a, b) MUL_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define MLA_OP(a, b, c) MLA_SAT_OP_EXPAND((a), (b), (c), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define DIV_OP(a, b) DIV_SAT_OP_VEC_EXPAND((a), (b), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define EXP_OP(a) EXP_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define LOG_OP(a) LOG_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define SQRT_OP(a) DIV_OP(CONST_ONE, INVSQRT_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION))
+#define TANH_OP(a) TANH_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+
+#else /* FIXED_POINT_POSITION */
+
+#define CONST_ONE 1.f
+#define ABS_OP(a) fabs((a))
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define MLA_OP(a, b, c) ((b) * (c) + (a))
+#define DIV_OP(a, b) ((a) / (b))
+#define EXP_OP(a) exp((a))
+#define LOG_OP(a) log((a))
+#define SQRT_OP(a) sqrt((a))
+#define TANH_OP(a) tanh((a))
+
+#endif /* FIXED_POINT_POSITION */
+
+// Logistic Activation
+inline TYPE logistic_op(TYPE x)
+{
+ return DIV_OP((TYPE)CONST_ONE, ADD_OP((TYPE)CONST_ONE, EXP_OP(-x)));
+}
+// Hyperbolic Tangent Activation
+inline TYPE tanh_op(TYPE x)
+{
+ return MUL_OP((TYPE)A_VAL, TANH_OP(MUL_OP((TYPE)B_VAL, x)));
+}
+// RELU Activation
+inline TYPE relu_op(TYPE x)
+{
+ return max(0, x);
+}
+// Bounded RELU Activation
+inline TYPE brelu_op(TYPE x)
+{
+ return min((TYPE)A_VAL, max(0, x));
+}
+// Lower Upper Bounded RELU Activation
+inline TYPE lu_brelu_op(TYPE x)
+{
+ return min(max(x, (TYPE)B_VAL), (TYPE)A_VAL);
+}
+// Leaky RELU Activation
+inline TYPE lrelu_op(TYPE x)
+{
+ return select(MUL_OP((TYPE)A_VAL, x), x, x > (TYPE)0);
+}
+// Soft RELU Activation
+inline TYPE srelu_op(TYPE x)
+{
+ return LOG_OP(ADD_OP((TYPE)CONST_ONE, EXP_OP(x)));
+}
+// Absolute Activation
+inline TYPE abs_op(TYPE x)
+{
+ return ABS_OP(x);
+}
+// Square Activation
+inline TYPE square_op(TYPE x)
+{
+ return MUL_OP(x, x);
+}
+// Square-root Activation
+inline TYPE sqrt_op(TYPE x)
+{
+ return SQRT_OP(x);
+}
+// Linear Activation
+inline TYPE linear_op(TYPE x)
+{
+ return MLA_OP((TYPE)B_VAL, (TYPE)A_VAL, x);
+}
+
+#define ACTIVATION_OP2(op, x) op##_op(x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+
/** This performs an activation function on floating point inputs.
*
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Activation function should be given as a preprocessor argument using -DNAME. e.g. -DTANH
- * @note Distinction between floating point and integer is done using -DTYPE_FP and -DTYPE_INT preprocessor argument
- * @note A, B variables required by some activation functions are set using -DA= and -DB= respectively.
+ * @note In order to perform the activation function "in-place", the preprocessor argument -DIN_PLACE must be passed at compile time
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note In case of fixed point calculations the fixed point position is passed using -DFIXED_POINT_POSITION=position. e.g. -DFIXED_POINT_POSITION=3.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
@@ -38,7 +133,7 @@
* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
@@ -48,42 +143,28 @@
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
*/
__kernel void activation_layer(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
{
// Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
// Load data
- VEC_DATA_TYPE(DATA_TYPE, 16)
- data = vload16(0, (__global DATA_TYPE *)input.ptr);
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
// Perform activation
-#if defined LOGISTIC
- data = 1 / (1 + exp(-data));
-#elif defined TANH
- data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * tanh((VEC_DATA_TYPE(DATA_TYPE, 16))B * data);
-#elif defined RELU
- data = max(0, data);
-#elif defined BRELU
- data = min((VEC_DATA_TYPE(DATA_TYPE, 16))A, max(0, data));
-#elif defined SRELU
- data = log(1 + exp(data));
-#elif defined ABS
-#if defined TYPE_INT
- data = abs(data);
-#else
- data = fabs(data);
-#endif
-#elif defined SQUARE
- data = data * data;
-#elif defined SQRT
- data = sqrt(data);
-#elif defined LINEAR
- data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * data + (VEC_DATA_TYPE(DATA_TYPE, 16))B;
-#endif
+ data = ACTIVATION_OP(ACT, data);
// Store result
- vstore16(data, 0, (__global DATA_TYPE *)output.ptr);
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)output.ptr);
}
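
The @note lines above list the compile-time configuration this kernel now expects. As a purely hypothetical host-side example (the concrete option values are chosen for illustration, not taken from this patch), a 16-wide F32 bounded-RELU build operating in place could pass the set below; note that ACTIVATION_OP pastes "_op" onto ACT, so ACT has to name one of the *_op functions defined above (brelu, relu, tanh, ...):

#include <iostream>
#include <set>
#include <string>

// Hypothetical build options for activation_layer (illustration only).
std::set<std::string> activation_build_options()
{
    return {
        "-DDATA_TYPE=float", // element type
        "-DVEC_SIZE=16",     // elements processed per work-item
        "-DACT=brelu",       // expands to brelu_op(...) via ACTIVATION_OP
        "-DA_VAL=6.0f",      // upper bound used by brelu_op
        "-DIN_PLACE"         // output tensor arguments are omitted
    };
}

int main()
{
    for(const auto &opt : activation_build_options())
    {
        std::cout << opt << " ";
    }
    std::cout << "\n";
    return 0;
}
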
diff --git a/src/core/CL/cl_kernels/arithmetic_op.cl b/src/core/CL/cl_kernels/arithmetic_op.cl
index 434300e..0341410 100644
--- a/src/core/CL/cl_kernels/arithmetic_op.cl
+++ b/src/core/CL/cl_kernels/arithmetic_op.cl
@@ -23,13 +23,17 @@
*/
#include "helpers.h"
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+#endif /* FIXED_POINT_POSITION */
+
#ifdef SATURATE
#define ADD(x, y) add_sat((x), (y))
#define SUB(x, y) sub_sat((x), (y))
-#else
+#else /* SATURATE */
#define ADD(x, y) (x) + (y)
#define SUB(x, y) (x) - (y)
-#endif
+#endif /* SATURATE */
/** This function adds two images.
*
@@ -37,19 +41,19 @@
* e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
* @attention To perform a saturating operation, -DSATURATE has to be passed to the compiler; otherwise the wrapping policy will be used.
*
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16/F16/F32
* @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8/QS8 (only if @p in1_ptr is QS8), QS16 (only if @p in1_ptr is QS16), S16/F16/F32
* @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8 (only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32
* @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
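
The @attention note above selects between saturating and wrapping arithmetic at compile time. For illustration, a scalar sketch of the behaviour the saturating variant (add_sat) gives for signed 16-bit data; this is not library code:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Clamp the widened sum into the S16 range instead of letting it wrap.
int16_t add_sat_s16(int16_t a, int16_t b)
{
    const int32_t sum = static_cast<int32_t>(a) + static_cast<int32_t>(b);
    return static_cast<int16_t>(std::min<int32_t>(INT16_MAX, std::max<int32_t>(INT16_MIN, sum)));
}

int main()
{
    std::printf("%d\n", add_sat_s16(30000, 10000)); // 32767 with -DSATURATE semantics
    return 0;
}
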
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 13e6702..b7423d8 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -23,9 +23,28 @@
*/
#include "helpers.h"
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+
+#define ADD_OP(a, b) ADD_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE)
+#define SUB_OP(a, b) SUB_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE)
+#define MUL_OP(a, b) MUL_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define INVSQRT_OP(a) INVSQRT_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define SQCVT_SAT(a) SQCVT_SAT_OP_EXPAND((a), DATA_TYPE, FIXED_POINT_POSITION)
+
+#else /* FIXED_POINT_POSITION */
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) rsqrt((a))
+#define SQCVT_SAT(a) (a)
+
+#endif /* FIXED_POINT_POSITION */
+
/** Apply batch normalization.
*
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F32
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QS8/QS16/F32
* @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
@@ -33,7 +52,7 @@
* @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
@@ -41,59 +60,72 @@
* @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: F32
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
* @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
* @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] var_ptr Pointer to the var tensor. Supported data types: F32
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
* @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
* @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
- * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: F32
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
* @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
* @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
- * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: F32
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
* @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
* @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
* @param[in] epsilon Epsilon parameter in the batch normalization equation
*/
__kernel void batchnormalization_layer(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
VECTOR_DECLARATION(mean),
VECTOR_DECLARATION(var),
VECTOR_DECLARATION(beta),
VECTOR_DECLARATION(gamma),
float epsilon)
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
- Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
- Vector var = CONVERT_TO_VECTOR_STRUCT(var);
- Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
- Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D out = in;
+#else /* IN_PLACE */
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+ Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+ Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
- float4 _in = 0;
- float4 denominator = 0;
- float4 numerator = 0;
- float4 x_bar = 0;
- float4 gamma_vec = 0;
- float4 beta_vec = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ _in = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ denominator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ numerator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ x_bar = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ gamma_vec = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_vec = 0;
const int current_slice = get_global_id(2);
- _in = vload4(0, (__global float *)in.ptr);
- denominator = *((__global float *)(var.ptr + current_slice * var.stride_x));
- denominator = rsqrt(denominator + epsilon);
+ _in = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+ denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
+ denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(epsilon)));
// Calculate x bar and store results
- numerator = *((__global float *)(mean.ptr + current_slice * mean.stride_x));
- numerator = _in - numerator;
- x_bar = numerator * denominator;
+ numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
+ numerator = SUB_OP(_in, numerator);
+ x_bar = MUL_OP(numerator, denominator);
- gamma_vec = *((__global float *)(gamma.ptr + current_slice * beta.stride_x));
- beta_vec = *((__global float *)(beta.ptr + current_slice * beta.stride_x));
+ gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * beta.stride_x));
+ beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
- vstore4(gamma_vec * x_bar + beta_vec, 0, (__global float *)out.ptr);
+ VSTORE(VEC_SIZE)
+ (ADD_OP(MUL_OP(gamma_vec, x_bar), beta_vec), 0, (__global DATA_TYPE *)out.ptr);
}
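The arithmetic applied to every element is the standard batch normalization equation: out = gamma * (x - mean) / sqrt(var + epsilon) + beta, with one mean/var/beta/gamma value per Z slice. A minimal scalar C++ sketch of that equation, independent of the OpenCL macros above (the dense layout and names are illustrative only):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Reference batch normalization: one mean/var/beta/gamma value per channel.
    // 'in' is laid out as [channels][plane], where plane = height * width.
    std::vector<float> batchnorm_reference(const std::vector<float> &in,
                                           const std::vector<float> &mean,
                                           const std::vector<float> &var,
                                           const std::vector<float> &beta,
                                           const std::vector<float> &gamma,
                                           std::size_t channels, std::size_t plane,
                                           float epsilon)
    {
        std::vector<float> out(in.size());
        for(std::size_t c = 0; c < channels; ++c)
        {
            const float denominator = 1.0f / std::sqrt(var[c] + epsilon); // rsqrt(var + epsilon)
            for(std::size_t i = 0; i < plane; ++i)
            {
                const float x_bar = (in[c * plane + i] - mean[c]) * denominator;
                out[c * plane + i] = gamma[c] * x_bar + beta[c];
            }
        }
        return out;
    }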
diff --git a/src/core/CL/cl_kernels/channel_combine.cl b/src/core/CL/cl_kernels/channel_combine.cl
index 93e80b9..d309812 100644
--- a/src/core/CL/cl_kernels/channel_combine.cl
+++ b/src/core/CL/cl_kernels/channel_combine.cl
@@ -337,11 +337,11 @@
uchar8 data1 = vload8(0, src_plane1.ptr);
uchar8 data2 = vload8(0, src_plane2.ptr);
-#if defined NV12
+#ifdef NV12
vstore16(shuffle2(data1, data2, (uchar16)(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15)), 0, dst_plane1.ptr);
-#elif defined NV21
+#elif defined(NV21)
vstore16(shuffle2(data2, data1, (uchar16)(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15)), 0, dst_plane1.ptr);
-#endif
+#endif /* NV12 or NV21 */
}
/** This function combines three planes to a single YUV444 or IYUV image.
@@ -405,12 +405,12 @@
// Copy plane data
vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
-#if defined YUV444
+#ifdef YUV444
vstore16(vload16(0, src_plane1.ptr), 0, dst_plane1.ptr);
vstore16(vload16(0, src_plane2.ptr), 0, dst_plane2.ptr);
-#elif defined IYUV
+#elif defined(IYUV)
vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height));
vstore8(vload8(0, src_plane1.ptr), 0, dst_plane1.ptr);
vstore8(vload8(0, src_plane2.ptr), 0, dst_plane2.ptr);
-#endif
+#endif /* YUV444 or IYUV */
}
diff --git a/src/core/CL/cl_kernels/channel_extract.cl b/src/core/CL/cl_kernels/channel_extract.cl
index 14c6c8a9..e95bda4 100644
--- a/src/core/CL/cl_kernels/channel_extract.cl
+++ b/src/core/CL/cl_kernels/channel_extract.cl
@@ -51,16 +51,16 @@
uchar16 data = vload16(0, src.ptr);
uchar8 data2 = vload8(0, src.ptr + 16);
-#if defined CHANNEL_R
+#ifdef CHANNEL_R
vstore4(data.s0369, 0, dst.ptr);
vstore4((uchar4)(data.sCF, data2.s25), 0, dst.ptr + 4);
-#elif defined CHANNEL_G
+#elif defined(CHANNEL_G)
vstore4(data.s147A, 0, dst.ptr);
vstore4((uchar4)(data.sD, data2.s036), 0, dst.ptr + 4);
-#elif defined CHANNEL_B
+#elif defined(CHANNEL_B)
vstore4(data.s258B, 0, dst.ptr);
vstore4((uchar4)(data.sE, data2.s147), 0, dst.ptr + 4);
-#endif
+#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B */
}
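Packed RGB888 keeps the three channels of each pixel in consecutive bytes, so channel extraction is a strided gather: output[i] = input[3 * i + c] with c = 0, 1 or 2 for R, G, B (and a stride of 4 for the RGBA kernel below). A scalar C++ sketch of that access pattern on dense buffers (names are illustrative):

    #include <cstddef>
    #include <cstdint>

    // Extract one channel from a packed pixel buffer.
    // bytes_per_pixel is 3 for RGB888 and 4 for RGBA8888; channel is the byte
    // offset of the requested channel inside each pixel.
    void extract_channel(const uint8_t *src, uint8_t *dst, std::size_t num_pixels,
                         std::size_t bytes_per_pixel, std::size_t channel)
    {
        for(std::size_t i = 0; i < num_pixels; ++i)
        {
            dst[i] = src[i * bytes_per_pixel + channel];
        }
    }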
/** This function extracts a given channel from an RGBA image.
@@ -91,15 +91,15 @@
uchar16 data = vload16(0, src.ptr);
uchar16 data2 = vload16(0, src.ptr + 16);
-#if defined CHANNEL_R
+#ifdef CHANNEL_R
vstore8((uchar8)(data.s048C, data2.s048C), 0, dst.ptr);
-#elif defined CHANNEL_G
+#elif defined(CHANNEL_G)
vstore8((uchar8)(data.s159D, data2.s159D), 0, dst.ptr);
-#elif defined CHANNEL_B
+#elif defined(CHANNEL_B)
vstore8((uchar8)(data.s26AE, data2.s26AE), 0, dst.ptr);
-#elif defined CHANNEL_A
+#elif defined(CHANNEL_A)
vstore8((uchar8)(data.s37BF, data2.s37BF), 0, dst.ptr);
-#endif
+#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B or CHANNEL_A */
}
/** This function extracts a given channel from an YUYV image.
@@ -129,13 +129,13 @@
uchar16 data = vload16(0, src.ptr);
-#if defined CHANNEL_Y
+#ifdef CHANNEL_Y
vstore8(data.s02468ACE, 0, dst.ptr);
-#elif defined CHANNEL_U
+#elif defined(CHANNEL_U)
vstore4(data.s159D, 0, dst.ptr);
-#elif defined CHANNEL_V
+#elif defined(CHANNEL_V)
vstore4(data.s37BF, 0, dst.ptr);
-#endif
+#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */
}
/** This function extracts a given channel from an UYUV image.
@@ -165,13 +165,13 @@
uchar16 data = vload16(0, src.ptr);
-#if defined CHANNEL_Y
+#ifdef CHANNEL_Y
vstore8(data.s13579BDF, 0, dst.ptr);
-#elif defined CHANNEL_U
+#elif defined(CHANNEL_U)
vstore4(data.s048C, 0, dst.ptr);
-#elif defined CHANNEL_V
+#elif defined(CHANNEL_V)
vstore4(data.s26AE, 0, dst.ptr);
-#endif
+#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */
}
/** This function extracts a given channel from an NV12 image.
@@ -202,11 +202,11 @@
uchar16 data = vload16(0, src.ptr);
-#if defined CHANNEL_U
+#ifdef CHANNEL_U
vstore8(data.s02468ACE, 0, dst.ptr);
-#elif defined CHANNEL_V
+#elif defined(CHANNEL_V)
vstore8(data.s13579BDF, 0, dst.ptr);
-#endif
+#endif /* CHANNEL_U or CHANNEL_V */
}
/** This function extracts a given channel from an NV21 image.
@@ -237,11 +237,11 @@
uchar16 data = vload16(0, src.ptr);
-#if defined CHANNEL_U
+#ifdef CHANNEL_U
vstore8(data.s13579BDF, 0, dst.ptr);
-#elif defined CHANNEL_V
+#elif defined(CHANNEL_V)
vstore8(data.s02468ACE, 0, dst.ptr);
-#endif
+#endif /* CHANNEL_U or CHANNEL_V */
}
/** This function extracts a given plane from an multi-planar image.
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index 00f5189..a92ab5b 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -25,29 +25,35 @@
/** This kernel concatenates the input tensor into the output tensor along the third dimension
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8, QS16, F16, F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] offset The offset to the first valid element of the output tensor in bytes
+ * @param[in] offsets The offsets to the first valid element of the output tensor in bytes
*/
__kernel void concatenate_depth(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- unsigned int offset)
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ int3 offsets)
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
- float4 source_values = vload4(0, (__global float *)src.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, -offsets.x, -offsets.y, 0));
- vstore4(source_values, 0, (__global float *)(dst.ptr + offset));
+ VSTORE(VEC_SIZE)
+ (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offsets.z));
}
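Conceptually, depth concatenation copies each input tensor into the output at a running depth offset (carried here as the byte offset offsets.z), while offsets.x and offsets.y compensate for padding differences between input and output. A minimal host-side C++ sketch of the element mapping, ignoring padding (dense buffers and names are illustrative):

    #include <cstddef>
    #include <vector>

    // Concatenate equally sized W x H planes of several inputs along the depth
    // axis. Input k has depth[k] planes of 'plane' elements each; 'output' must
    // be pre-sized to sum(depth) * plane elements.
    void concatenate_depth_reference(const std::vector<std::vector<float>> &inputs,
                                     const std::vector<std::size_t> &depth,
                                     std::size_t plane, std::vector<float> &output)
    {
        std::size_t z_base = 0;
        for(std::size_t k = 0; k < inputs.size(); ++k)
        {
            for(std::size_t z = 0; z < depth[k]; ++z)
            {
                for(std::size_t i = 0; i < plane; ++i)
                {
                    output[(z_base + z) * plane + i] = inputs[k][z * plane + i];
                }
            }
            z_base += depth[k];
        }
    }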
diff --git a/src/core/CL/cl_kernels/convolution3x3.cl b/src/core/CL/cl_kernels/convolution3x3.cl
index 3733d0c..8c75ecd 100644
--- a/src/core/CL/cl_kernels/convolution3x3.cl
+++ b/src/core/CL/cl_kernels/convolution3x3.cl
@@ -25,11 +25,11 @@
#ifndef DATA_TYPE
#define DATA_TYPE short
-#endif
+#endif /* DATA_TYPE */
#ifndef DATA_TYPE_OUT
#define DATA_TYPE_OUT uchar
-#endif
+#endif /* DATA_TYPE_OUT */
/** Compute a 1D horizontal convolution of size 3 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
*
diff --git a/src/core/CL/cl_kernels/convolution5x5.cl b/src/core/CL/cl_kernels/convolution5x5.cl
index d1335c5..605cd09 100644
--- a/src/core/CL/cl_kernels/convolution5x5.cl
+++ b/src/core/CL/cl_kernels/convolution5x5.cl
@@ -25,15 +25,15 @@
#ifndef DATA_TYPE
#define DATA_TYPE short
-#endif
+#endif /* DATA_TYPE */
#ifndef COMPUTE_TYPE
#define COMPUTE_TYPE int
-#endif
+#endif /* COMPUTE_TYPE */
#ifndef DATA_TYPE_OUT
#define DATA_TYPE_OUT uchar
-#endif
+#endif /* DATA_TYPE_OUT */
/** Compute a 1D horizontal convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
*
diff --git a/src/core/CL/cl_kernels/convolution7x7.cl b/src/core/CL/cl_kernels/convolution7x7.cl
index 74a0055..1abfb15 100644
--- a/src/core/CL/cl_kernels/convolution7x7.cl
+++ b/src/core/CL/cl_kernels/convolution7x7.cl
@@ -25,15 +25,15 @@
#ifndef DATA_TYPE
#define DATA_TYPE short
-#endif
+#endif /* DATA_TYPE */
#ifndef COMPUTE_TYPE
#define COMPUTE_TYPE int
-#endif
+#endif /* COMPUTE_TYPE */
#ifndef DATA_TYPE_OUT
#define DATA_TYPE_OUT uchar
-#endif
+#endif /* DATA_TYPE_OUT */
/** Compute a 1D horizontal convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
*
diff --git a/src/core/CL/cl_kernels/convolution9x9.cl b/src/core/CL/cl_kernels/convolution9x9.cl
index d8b07ca..f537326 100644
--- a/src/core/CL/cl_kernels/convolution9x9.cl
+++ b/src/core/CL/cl_kernels/convolution9x9.cl
@@ -25,15 +25,15 @@
#ifndef DATA_TYPE
#define DATA_TYPE short
-#endif
+#endif /* DATA_TYPE */
#ifndef COMPUTE_TYPE
#define COMPUTE_TYPE int
-#endif
+#endif /* COMPUTE_TYPE */
#ifndef DATA_TYPE_OUT
#define DATA_TYPE_OUT uchar
-#endif
+#endif /* DATA_TYPE_OUT */
/** Compute a 1D horizontal convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
*
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
index bd5dfaf..9e9d0b0 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/convolution_layer.cl
@@ -23,11 +23,15 @@
*/
#include "helpers.h"
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+#endif // FIXED_POINT_POSITION
+
/** This kernel reshapes the tensor's low three dimensions to single column
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -35,13 +39,13 @@
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as input
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] bias_ptr Pointer to the bias tensor. Same as input
+ * @param[in] bias_ptr Pointer to the bias tensor. Same as @p src_ptr
* @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes)
* @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] bias_offset_first_element_in_bytes The offset of the first element in the source tensor
@@ -53,9 +57,9 @@
__kernel void reshape_to_columns(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
-#if defined HAS_BIAS
+#ifdef HAS_BIAS
VECTOR_DECLARATION(bias),
-#endif
+#endif /* HAS_BIAS */
uint width, uint height, uint depth, uint total_filters)
{
Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
@@ -64,9 +68,9 @@
__global uchar *tmp_src_ptr = src.ptr;
__global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y + get_global_id(1) * width * dst_stride_y + get_global_id(
2) * width * height * dst_stride_y;
-#if defined HAS_BIAS
+#ifdef HAS_BIAS
__global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes;
-#endif
+#endif /* HAS_BIAS */
if(is_last_thread)
{
@@ -74,10 +78,10 @@
{
*((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr);
-#if defined HAS_BIAS
+#ifdef HAS_BIAS
*((__global DATA_TYPE *)(tmp_dst_ptr + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr));
tmp_bias_ptr += bias_stride_x;
-#endif
+#endif /* HAS_BIAS */
tmp_src_ptr += depth * src_stride_z;
tmp_dst_ptr += dst_stride_x;
}
@@ -93,12 +97,13 @@
}
}
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -106,75 +111,156 @@
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] kernel_size The convolution kernel size
- * @param[in] kernel_depth The kernel depth
- * @param[in] width The output tensor width
- * @param[in] input_dims The input tensor dimensions
- * @param[in] strides The strides of the im2col operation
- * @param[in] paddings The input tensor paddings
+ * @param[in] filter_depth The depth of the used filter
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
__kernel void im2col_generic(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
- int kernel_size,
- int kernel_depth,
- int width,
- int2 input_dims,
- int2 strides,
- int2 paddings)
+ uint filter_depth,
+ uint src_stride_w,
+ uint dst_stride_w)
{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT_NO_STEP(dst);
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % filter_depth; // input feature map
+ const int batch = get_global_id(2) / filter_depth; // the batch
- // Determine output index
- uint idx = (get_global_id(1) * width + get_global_id(0)) * dst.stride_y;
- __global uchar *output_ptr = dst.ptr + idx;
+ // Calculate input indices
+ const int xi = xc * STRIDE_X - PAD_X;
+ const int yi = yc * STRIDE_Y - PAD_Y;
- // Determine current input index
- const int top_left_x = get_global_id(0) * strides.x - paddings.x;
- const int top_left_y = get_global_id(1) * strides.y - paddings.y;
+ // Calculate output indices
+ const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+ const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+ __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
// Linearize convolution elements
- for(int d = 0; d < kernel_depth; ++d)
+ for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
{
- for(int y = top_left_y, y_e = top_left_y + kernel_size; y < y_e; ++y)
+ for(int x = xi, x_e = xi + KERNEL_WIDTH; x < x_e; ++x, ++output_ptr)
{
- for(int x = top_left_x, x_e = top_left_x + kernel_size; x < x_e; ++x, output_ptr += dst.stride_x)
+#if PAD_X == 0 && PAD_Y == 0
+ *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+#else // PAD_X == 0 && PAD_Y == 0
+ if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
{
- if(x < 0 || x >= input_dims.x || y < 0 || y >= input_dims.y)
- {
- *((__global DATA_TYPE *)output_ptr) = 0;
- }
- else
- {
- *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)(tensor3D_offset(&src, x, y, d)));
- }
+ *output_ptr = 0;
}
+ else
+ {
+ *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+ }
+#endif // PAD_X == 0 && PAD_Y == 0
}
}
-#if defined HAS_BIAS
- *((__global DATA_TYPE *)output_ptr) = 1;
-#endif
+#ifdef HAS_BIAS
+ if(ch == (KERNEL_DEPTH - 1))
+ {
+#ifdef FIXED_POINT_POSITION
+ *output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
+#else // FIXED_POINT_POSITION
+ *output_ptr = 1.0f;
+#endif // FIXED_POINT_POSITION
+ }
+#endif // HAS_BIAS
}
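For reference, im2col linearises each KERNEL_WIDTH x KERNEL_HEIGHT x KERNEL_DEPTH receptive field into one row of the output matrix and writes zeros wherever the window falls into the padding region. A single-batch C++ sketch of the same mapping on dense row-major buffers (names are illustrative; the kernel above additionally appends a 1 per row when -DHAS_BIAS is set):

    #include <cstddef>
    #include <vector>

    // Single-batch im2col for a dense [depth][height][width] input.
    // Each output row holds one receptive field of kw * kh * depth elements.
    std::vector<float> im2col_reference(const std::vector<float> &src,
                                        int width, int height, int depth,
                                        int kw, int kh, int stride_x, int stride_y,
                                        int pad_x, int pad_y)
    {
        const int out_w    = (width + 2 * pad_x - kw) / stride_x + 1;
        const int out_h    = (height + 2 * pad_y - kh) / stride_y + 1;
        const int row_size = kw * kh * depth;
        std::vector<float> dst(static_cast<std::size_t>(out_w) * out_h * row_size, 0.0f);

        for(int yc = 0; yc < out_h; ++yc)
        {
            for(int xc = 0; xc < out_w; ++xc)
            {
                const std::size_t row = static_cast<std::size_t>(yc) * out_w + xc;
                std::size_t col = 0;
                for(int ch = 0; ch < depth; ++ch)
                {
                    for(int y = yc * stride_y - pad_y; y < yc * stride_y - pad_y + kh; ++y)
                    {
                        for(int x = xc * stride_x - pad_x; x < xc * stride_x - pad_x + kw; ++x, ++col)
                        {
                            if(x >= 0 && x < width && y >= 0 && y < height)
                            {
                                dst[row * row_size + col] = src[(ch * height + y) * width + x];
                            }
                        }
                    }
                }
            }
        }
        return dst;
    }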
-/** This kernel performs a reshaping of the output of the convolution layer.
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 3x3 and pad_x = pad_y = 0
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] filter_depth The depth of the used filter
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_kernel3x3_padx0_pady0(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint filter_depth,
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % filter_depth; // input feature map
+ const int batch = get_global_id(2) / filter_depth; // the batch
+
+ // Calculate input indices
+ const int xi = xc * STRIDE_X;
+ const int yi = yc * STRIDE_Y;
+
+ // Calculate output indices
+ const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+ const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
+
+ __global DATA_TYPE *output_ptr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w) + xo;
+
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, output_ptr);
+ *(output_ptr + 8) = row2.s2;
+
+#ifdef HAS_BIAS
+ if(ch == (KERNEL_DEPTH - 1))
+ {
+#ifdef FIXED_POINT_POSITION
+ *(output_ptr + 9) = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
+#else // FIXED_POINT_POSITION
+ *(output_ptr + 9) = 1.0f;
+#endif // FIXED_POINT_POSITION
+ }
+#endif // HAS_BIAS
+}
+#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+
+#if defined(WIDTH_OUTPUT)
+/** This kernel performs a reshaping of the output of the convolution layer.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
@@ -182,27 +268,30 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] width The output tensor width
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void col2im(
- IMAGE_DECLARATION(src),
+ TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst),
- uint width)
+ uint dst_stride_w)
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
- int idx = get_global_id(0) * dst.stride_z + (get_global_id(1) / width) * dst.stride_y + (get_global_id(1) % width) * dst.stride_x;
- __global uchar *tmp_out_ptr = dst.ptr + idx;
- *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)(src.ptr));
+ // Compute output offset
+ int idx = get_global_id(0) * dst.stride_z + (get_global_id(1) / WIDTH_OUTPUT) * dst_stride_y + (get_global_id(1) % WIDTH_OUTPUT) * dst_stride_x + get_global_id(2) * dst_stride_w;
+
+ // Store value
+ *((__global DATA_TYPE *)(dst.ptr + idx)) = *((__global DATA_TYPE *)(src.ptr));
}
+#endif // defined(WIDTH_OUTPUT)
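col2im is the inverse reshaping: one coordinate of the GEMM result runs over output feature maps and the other over spatial positions n, and each element lands at (x, y, z) = (n % WIDTH_OUTPUT, n / WIDTH_OUTPUT, feature map) in the output tensor. A C++ sketch of that index mapping for one batch (dense buffers and names are illustrative; on dense buffers it degenerates to a plain copy, the mapping only matters once the destination carries padding and strides):

    #include <cstddef>
    #include <vector>

    // Reshape a [num_maps][out_h * out_w] GEMM result into a
    // [num_maps][out_h][out_w] tensor; 'dst' must be pre-sized accordingly.
    void col2im_reference(const std::vector<float> &src, std::vector<float> &dst,
                          std::size_t num_maps, std::size_t out_w, std::size_t out_h)
    {
        for(std::size_t m = 0; m < num_maps; ++m)
        {
            for(std::size_t n = 0; n < out_w * out_h; ++n)
            {
                const std::size_t x = n % out_w;
                const std::size_t y = n / out_w;
                dst[(m * out_h + y) * out_w + x] = src[m * out_w * out_h + n];
            }
        }
    }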
/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
* @note In case biases will be added in late stage, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -210,7 +299,7 @@
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as input.
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
@@ -230,12 +319,16 @@
*((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)src.ptr);
-#if defined HAS_BIAS
+#ifdef HAS_BIAS
// If it is the last thread in the 3 dimensional workgroup
if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1))
{
tmp_out_ptr += dst_stride_x;
+#ifdef FIXED_POINT_POSITION
+ *((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
+#else // FIXED_POINT_POSITION
*((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)1;
+#endif // FIXED_POINT_POSITION
}
-#endif
+#endif // HAS_BIAS
}
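Note that the fixed-point branch appends the fixed-point representation of 1 rather than the raw integer 1: a QS value stores a real number scaled by 2^FIXED_POINT_POSITION, so 1.0 becomes 1 << FIXED_POINT_POSITION. A small illustrative C++ sketch of that encoding (no saturation or rounding handling; the helper names are not part of the library):

    #include <cstdint>

    // Illustrative QS8 encode/decode for a given fixed point position.
    inline int8_t qs8_from_float(float x, int position)
    {
        return static_cast<int8_t>(x * static_cast<float>(1 << position));
    }

    inline float qs8_to_float(int8_t x, int position)
    {
        return static_cast<float>(x) / static_cast<float>(1 << position);
    }

    // Example: with position = 5, qs8_from_float(1.0f, 5) == 32, i.e. 1 << 5,
    // which is exactly the value appended for the bias column above.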
diff --git a/src/core/CL/cl_kernels/convolution_rectangle.cl b/src/core/CL/cl_kernels/convolution_rectangle.cl
index 96b9cff..f5a109f 100644
--- a/src/core/CL/cl_kernels/convolution_rectangle.cl
+++ b/src/core/CL/cl_kernels/convolution_rectangle.cl
@@ -31,15 +31,15 @@
#ifndef DATA_TYPE
#define DATA_TYPE short
-#endif
+#endif /* DATA_TYPE */
#ifndef COMPUTE_TYPE
#define COMPUTE_TYPE int
-#endif
+#endif /* COMPUTE_TYPE */
#ifndef DATA_TYPE_OUT
#define DATA_TYPE_OUT uchar
-#endif
+#endif /* DATA_TYPE_OUT */
#ifndef DYNAMIC_MATRIX_CONVOLUTION
@@ -89,24 +89,24 @@
#if MATRIX_WIDTH == 3
pixels += convolution1x3(offset(&src, -1, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 3], matrix_coeff[1 + i * 3],
matrix_coeff[2 + i * 3]);
-#endif
+#endif /* MATRIX_WIDTH */
#if MATRIX_WIDTH == 5
pixels += convolution1x5(offset(&src, -2, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 5], matrix_coeff[1 + i * 5],
matrix_coeff[2 + i * 5], matrix_coeff[3 + i * 5], matrix_coeff[4 + i * 5]);
-#endif
+#endif /* MATRIX_WIDTH */
#if MATRIX_WIDTH == 7
pixels += convolution1x7(offset(&src, -3, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 7], matrix_coeff[1 + i * 7],
matrix_coeff[2 + i * 7], matrix_coeff[3 + i * 7], matrix_coeff[4 + i * 7],
matrix_coeff[5 + i * 7], matrix_coeff[6 + i * 7]);
-#endif
+#endif /* MATRIX_WIDTH */
#if MATRIX_WIDTH == 9
pixels += convolution1x9(offset(&src, -4, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 9], matrix_coeff[1 + i * 9],
matrix_coeff[2 + i * 9], matrix_coeff[3 + i * 9], matrix_coeff[4 + i * 9],
matrix_coeff[5 + i * 9], matrix_coeff[6 + i * 9], matrix_coeff[7 + i * 9], matrix_coeff[8 + i * 9]);
-#endif
+#endif /* MATRIX_WIDTH */
}
pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))SCALE;
@@ -115,4 +115,4 @@
vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, ((__global DATA_TYPE_OUT *)dst.ptr));
}
-#endif // DYNAMIC_MATRIX_CONVOLUTION
+#endif /* not DYNAMIC_MATRIX_CONVOLUTION */
diff --git a/src/core/CL/cl_kernels/depth_convert.cl b/src/core/CL/cl_kernels/depth_convert.cl
index c8eaa95..a9b7284 100644
--- a/src/core/CL/cl_kernels/depth_convert.cl
+++ b/src/core/CL/cl_kernels/depth_convert.cl
@@ -23,24 +23,47 @@
*/
#include "helpers.h"
+#if defined(FIXED_POINT_POSITION)
+
+#include "fixed_point.h"
+
+#ifdef SATURATE
+#define CONVERT_DOWN(x, in_type, out_type, fixed_point_position) CONVERT_DOWN1_SAT(x, in_type, out_type, fixed_point_position)
+#define CONVERT_DOWN1_SAT(x, in_type, out_type, fixed_point_position) convert_##out_type##_##in_type##_sat(x, fixed_point_position)
+#else /* SATURATE */
+#define CONVERT_DOWN(x, in_type, out_type, fixed_point_position) CONVERT_DOWN1(x, in_type, out_type, fixed_point_position)
+#define CONVERT_DOWN1(x, in_type, out_type, fixed_point_position) convert_##out_type##_##in_type(x, fixed_point_position)
+#endif /* SATURATE */
+
+#define CONVERT_UP(x, in_type, out_type, fixed_point_position) CONVERT_UP1(x, in_type, out_type, fixed_point_position)
+#define CONVERT_UP1(x, in_type, out_type, fixed_point_position) convert_##out_type##_##in_type(x, fixed_point_position)
+
+#else /* FIXED_POINT_POSITION */
+
#ifdef SATURATE
#define CONVERT_DOWN(x, type) CONVERT_SAT(x, type)
-#else
+#else /* SATURATE */
#define CONVERT_DOWN(x, type) CONVERT(x, type)
-#endif
+#endif /* SATURATE */
+
+#define CONVERT_UP(x, type) CONVERT(x, type)
+
+#endif /* FIXED_POINT_POSITION */
/** This function performs a down-scaling depth conversion.
*
* @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
*
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32 or S32
+ * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F16, F32
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: QS8, U8, QS16, U16, S16, U32, S32
* @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
@@ -60,7 +83,12 @@
// Load data
VEC_DATA_TYPE(DATA_TYPE_IN, 16)
in_data = vload16(0, (__global DATA_TYPE_IN *)in.ptr);
+
+#if defined(FIXED_POINT_POSITION)
+ vstore16(CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_IN, 16), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), FIXED_POINT_POSITION), 0, (__global DATA_TYPE_OUT *)out.ptr);
+#else /* FIXED_POINT_POSITION */
vstore16(CONVERT_DOWN(in_data >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+#endif /* FIXED_POINT_POSITION */
}
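In the non-fixed-point path the down conversion is an arithmetic right shift followed by a cast to the narrower type, saturating when -DSATURATE is defined. A scalar C++ sketch of that behaviour for the S16 to U8 case (illustrative only, not the library implementation):

    #include <algorithm>
    #include <cstdint>

    // Down-scaling depth conversion, S16 -> U8, with an optional saturating cast.
    inline uint8_t depth_convert_down_s16_to_u8(int16_t in, int shift, bool saturate)
    {
        const int16_t shifted = static_cast<int16_t>(in >> shift);
        if(saturate)
        {
            // CONVERT_SAT clamps to the destination range, here [0, 255].
            return static_cast<uint8_t>(std::min<int16_t>(std::max<int16_t>(shifted, 0), 255));
        }
        return static_cast<uint8_t>(shifted); // plain truncating cast, like CONVERT()
    }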
/** This function performs a up-scaling depth conversion.
@@ -68,13 +96,15 @@
* @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
*
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32 or S32
+ * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, QS8, U16, S16, QS16, U32 or S32
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32, S32, F16 or F32
* @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
@@ -92,7 +122,12 @@
Image out = CONVERT_TO_IMAGE_STRUCT(out);
// Load data
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- in_data = CONVERT(vload16(0, (__global DATA_TYPE_IN *)in.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
- vstore16(in_data << shift, 0, (__global DATA_TYPE_OUT *)out.ptr);
+ VEC_DATA_TYPE(DATA_TYPE_IN, 16)
+ in_data = vload16(0, (__global DATA_TYPE_IN *)in.ptr);
+
+#if defined(FIXED_POINT_POSITION)
+ vstore16(CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_IN, 16), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), FIXED_POINT_POSITION), 0, (__global DATA_TYPE_OUT *)out.ptr);
+#else /* FIXED_POINT_POSITION */
+ vstore16(CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)) << shift, 0, (__global DATA_TYPE_OUT *)out.ptr);
+#endif /* FIXED_POINT_POSITION */
}
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
new file mode 100644
index 0000000..9c2c3a5
--- /dev/null
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#if defined(CONV_STRIDE_X)
+
+#if CONV_STRIDE_X == 1
+#define convolution1x3 convolution1x3_stride_1
+#elif CONV_STRIDE_X == 2
+#define convolution1x3 convolution1x3_stride_2
+#elif CONV_STRIDE_X == 3
+#define convolution1x3 convolution1x3_stride_3
+#else /* CONV_STRIDE_X */
+#error "Stride not supported"
+#endif /* CONV_STRIDE_X */
+
+/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
+ *
+ * @param[in] left_pixel Pointer to the left pixel.
+ * @param[in] left_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff Weight of the right pixel
+ *
+ * @return a float2 containing 2 convolved values.
+ */
+inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
+ const float left_coeff,
+ const float middle_coeff,
+ const float right_coeff)
+{
+ float4 temp = vload4(0, (__global float *)left_pixel);
+
+ float2 left = CONVERT(temp.s01, float2);
+ float2 middle = CONVERT(temp.s12, float2);
+ float2 right = CONVERT(temp.s23, float2);
+
+ return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+}
+
+/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
+ *
+ * @param[in] left_pixel Pointer to the left pixel.
+ * @param[in] left_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff Weight of the right pixel
+ *
+ * @return a float2 containing 2 convolved values.
+ */
+inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
+ const float left_coeff,
+ const float middle_coeff,
+ const float right_coeff)
+{
+ float4 temp0 = vload4(0, (__global float *)left_pixel);
+ float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
+
+ float2 left = CONVERT(temp0.s02, float2);
+ float2 middle = CONVERT(temp0.s13, float2);
+ float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
+
+ return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+}
+
+/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
+ *
+ * @param[in] left_pixel Pointer to the left pixel.
+ * @param[in] left_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff Weight of the right pixel
+ *
+ * @return a float2 containing 2 convolved values.
+ */
+inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
+ const float left_coeff,
+ const float middle_coeff,
+ const float right_coeff)
+{
+ float4 temp0 = vload4(0, (__global float *)left_pixel);
+ float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
+
+ float2 left = CONVERT(temp0.s03, float2);
+ float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
+ float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
+
+ return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+}
+
+/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
+ *
+ * Convolution matrix layout:
+ *
+ * [ mat0, mat1, mat2 ]\n
+ * [ mat3, mat4, mat5 ]\n
+ * [ mat6, mat7, mat8 ]\n
+ *
+ * @param[in] src A pointer to source Image structure
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ *
+ * @return a float2 containing 2 convolved values.
+ */
+inline float2 convolution3x3(
+ Image *src,
+ const float mat0, const float mat1, const float mat2,
+ const float mat3, const float mat4, const float mat5,
+ const float mat6, const float mat7, const float mat8)
+{
+ float2 pixels;
+
+ pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
+ pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
+ pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);
+
+ return pixels;
+}
+
+/** This function computes a 3x3 depthwise convolution.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ */
+
+__kernel void depthwise_convolution_3x3(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), TENSOR3D_DECLARATION(weights))
+{
+ Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+
+ uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
+ float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
+ float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
+ float3 weights_values2 = vload3(0, (__global float *)(weights.ptr + offset.s2));
+
+ float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
+ weights_values1.s0, weights_values1.s1, weights_values1.s2,
+ weights_values2.s0, weights_values2.s1, weights_values2.s2);
+
+ vstore2(pixels, 0, (__global float *)dst.ptr);
+}
+
+#endif //defined(CONV_STRIDE_X)
+
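For reference, a depthwise convolution applies one 3x3 filter per input channel and produces one output channel per input channel; unlike a regular convolution there is no accumulation across channels. A minimal scalar C++ sketch for a single channel over the valid (non-padded) region (dense buffers and names are illustrative):

    #include <cstddef>
    #include <vector>

    // Depthwise 3x3 convolution for one channel, valid region only.
    // in: [h][w], weights: [3][3], out must be pre-sized to
    // ((h - 3) / stride_y + 1) * ((w - 3) / stride_x + 1) elements.
    void depthwise_conv3x3_channel(const std::vector<float> &in, int w, int h,
                                   const float weights[9], int stride_x, int stride_y,
                                   std::vector<float> &out)
    {
        const int out_w = (w - 3) / stride_x + 1;
        const int out_h = (h - 3) / stride_y + 1;
        for(int oy = 0; oy < out_h; ++oy)
        {
            for(int ox = 0; ox < out_w; ++ox)
            {
                float acc = 0.0f;
                for(int ky = 0; ky < 3; ++ky)
                {
                    for(int kx = 0; kx < 3; ++kx)
                    {
                        acc += in[(oy * stride_y + ky) * w + (ox * stride_x + kx)] * weights[ky * 3 + kx];
                    }
                }
                out[oy * out_w + ox] = acc;
            }
        }
    }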
+#if defined(SRC_WIDTH) && defined(DATA_TYPE)
+/** This kernel reshapes each of the tensor's low three dimensions to single rows.
+ *
+ * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depthwise_weights_reshape(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ __global DATA_TYPE *input_ptr = (__global DATA_TYPE *)src.ptr;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
+
+ for(int i = 0; i < SRC_WIDTH; ++i, ++input_ptr)
+ {
+ *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *input_ptr;
+ }
+}
+#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
+
+#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_X, -DPAD_Y, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+
+__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ const int src_pixel_linear = get_global_id(1) * STRIDE_X;
+ const int full_length = SRC_WIDTH + 2 * PAD_X;
+ const int max_initial_x = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);
+
+ const int src_x = -PAD_X + src_pixel_linear % max_initial_x;
+ const int src_y = -PAD_Y + src_pixel_linear / max_initial_x * STRIDE_Y;
+ const int src_z = get_global_id(2);
+
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z;
+ __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
+
+ for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)
+ {
+ for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)
+ {
+ if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
+ {
+ *output_ptr = 0;
+ }
+ else
+ {
+ *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+ }
+ }
+ }
+}
+
+#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
+
+#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
+
+/** This kernel performs a reshaping of the output of the depthwise generic convolution.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depthwise_vector_to_tensor(
+ VECTOR_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Vector src = CONVERT_TO_VECTOR_STRUCT(src);
+
+ const int patch_size = CONV_WIDTH * CONV_HEIGHT;
+ const int id0 = get_global_id(0);
+ const int z = id0 / patch_size;
+ const int index2D = id0 - z * patch_size;
+
+ __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * dst_stride_x + index2D / CONV_WIDTH * dst_stride_y + z * dst_stride_z;
+ *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
+}
+
+#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
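The mapping from the linear vector index back to tensor coordinates follows directly from the patch size CONV_WIDTH * CONV_HEIGHT. A small C++ sketch of the same index arithmetic (names are illustrative):

    #include <cstddef>

    struct Coord3D
    {
        std::size_t x, y, z;
    };

    // Map a linear index of the intermediate vector to (x, y, z) coordinates of
    // the output tensor, given the size of one convolved plane.
    inline Coord3D vector_to_tensor_coord(std::size_t i, std::size_t conv_width, std::size_t conv_height)
    {
        const std::size_t patch_size = conv_width * conv_height;
        Coord3D c;
        c.z = i / patch_size;
        const std::size_t rem = i - c.z * patch_size;
        c.y = rem / conv_width;
        c.x = rem % conv_width;
        return c;
    }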
diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl
new file mode 100644
index 0000000..21e9c87
--- /dev/null
+++ b/src/core/CL/cl_kernels/dequantization_layer.cl
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This kernel performs the dequantization of 8-bit unsigned integers to floating point.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] min_max_ptr Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32.
+ * @param[in] min_max_stride_x Stride of the min/max vector in X dimension (in bytes)
+ * @param[in] min_max_step_x min_max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] min_max_offset_first_element_in_bytes The offset of the first element in the min/max vector
+ */
+__kernel void dequantization_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ VECTOR_DECLARATION(min_max))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Vector min_max = CONVERT_TO_VECTOR_STRUCT(min_max);
+
+ // min_max_value.s0 = min, min_max_value.s1 = max
+ const float2 min_max_value = vload2(0, (__global float *)min_max.ptr);
+
+ const float4 vmin = (float4)min_max_value.s0;
+ const float4 scale = (float4)((min_max_value.s1 - min_max_value.s0) / 255.0f);
+
+ // Load data
+ const uchar4 data = vload4(0, (__global uchar *)input.ptr);
+
+ // Dequantize
+ const float4 res = convert_float4(data) * scale + vmin;
+
+ // Store result
+ vstore4(res, 0, (__global float *)output.ptr);
+}
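+
+/* Reference only: a scalar sketch of the dequantization arithmetic used above,
+ * assuming an 8-bit input quantized over [min, max]. The helper below is purely
+ * illustrative (its name is not part of the library) and is never called by the
+ * kernel.
+ */
+inline float dequantize_scalar(uchar q, float min_val, float max_val)
+{
+    const float scale = (max_val - min_val) / 255.0f; // size of one quantization step
+    return (float)q * scale + min_val;                // same value as "res" in the kernel
+}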
diff --git a/src/core/CL/cl_kernels/derivative.cl b/src/core/CL/cl_kernels/derivative.cl
index 0e810d2..cd2091e 100644
--- a/src/core/CL/cl_kernels/derivative.cl
+++ b/src/core/CL/cl_kernels/derivative.cl
@@ -52,29 +52,29 @@
#ifdef GRAD_X
,
IMAGE_DECLARATION(dst_gx)
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
,
IMAGE_DECLARATION(dst_gy)
-#endif
+#endif /* GRAD_Y */
)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
#ifdef GRAD_X
Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif
+#endif /* GRAD_Y */
#ifdef GRAD_X
short16 l_data = convert_short16(vload16(0, offset(&src, -1, 0)));
short16 r_data = convert_short16(vload16(0, offset(&src, 1, 0)));
vstore16(r_data - l_data, 0, ((__global short *)dst_gx.ptr));
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
short16 t_data = convert_short16(vload16(0, offset(&src, 0, -1)));
short16 b_data = convert_short16(vload16(0, offset(&src, 0, 1)));
vstore16(b_data - t_data, 0, ((__global short *)dst_gy.ptr));
-#endif
+#endif /* GRAD_Y */
}
diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/direct_convolution1x1.cl
new file mode 100644
index 0000000..fb516dd
--- /dev/null
+++ b/src/core/CL/cl_kernels/direct_convolution1x1.cl
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+
+#define ADD_OP(a, b) ADD_SAT_OP_EXPAND((a), (b), DATA_TYPE_PROMOTED, 8)
+#define MUL_OP(a, b) MUL_SAT_OP_EXPAND(CONVERT((a), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), CONVERT((b), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), DATA_TYPE_PROMOTED, 8, FIXED_POINT_POSITION)
+
+// There is no need to have a larger intermediate type for qs32 because all the arguments are already promoted
+MULQ_SAT_IMPL(qs32x8, qs32x8)
+
+#else /* FIXED_POINT_POSITION */
+#undef CONVERT_SAT
+
+#define ADD_OP(a, b) ((a) + (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define CONVERT_SAT(a, b) ((a))
+
+#endif /* FIXED_POINT_POSITION */
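+
+/* Illustrative note on the macros above: in fixed point builds ADD_OP and MUL_OP
+ * saturate and operate on the promoted type (e.g. qs16 accumulators for qs8
+ * inputs), while in floating point builds they reduce to plain + and * and
+ * CONVERT_SAT becomes a no-op, so the final vstore writes the accumulator
+ * unchanged.
+ */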
+
+#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if STRIDE_X == 3
+#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size
+#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size)
+#elif STRIDE_X == 2
+#define INPUT_PIXEL(data_size) extract_input_stride2
+#elif STRIDE_X == 1
+#define INPUT_PIXEL(data_size) extract_input_stride1
+#else /* STRIDE_X not equal to 1, 2 or 3 */
+#error "Only strides 1, 2 and 3 are supported"
+#endif /* STRIDE_X == 3 */
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 1.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)
+{
+ return vload8(0, input_pixel);
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 2.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ temp = vload16(0, input_pixel);
+ return temp.s02468ace;
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 and 32-bit data size.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ temp1 = vload4(0, input_pixel);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ temp2 = vload4(0, input_pixel + 6);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ temp3 = vload4(0, input_pixel + 12);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ temp4 = vload4(0, input_pixel + 18);
+ return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03);
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 and 16-bit data size.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp1 = vload8(0, input_pixel);
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp2 = vload8(0, input_pixel + 8);
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp3 = vload8(0, input_pixel + 16);
+ return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 and 8-bit data size.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ temp1 = vload16(0, input_pixel);
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ temp2 = vload16(0, input_pixel + 12);
+ return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
+}
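+
+/* Illustrative example of how the INPUT_PIXEL macro above resolves: with
+ * -DSTRIDE_X=3 and -DDATA_SIZE=32, INPUT_PIXEL(DATA_SIZE) expands to
+ * extract_input_stride3_32, while with -DSTRIDE_X=1 it expands to
+ * extract_input_stride1 regardless of DATA_SIZE.
+ */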
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution1x1(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* defined(HAS_BIAS) */
+
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
+ pixels = 0;
+
+ const uint z_index = get_global_id(2);
+
+ weights.ptr += z_index * weights_stride_w;
+
+ for(int d = 0; d < WEIGHTS_DEPTH; ++d)
+ {
+ DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);
+ pixels = ADD_OP(pixels, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel));
+ src.ptr += src_stride_z;
+ weights.ptr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ pixels = ADD_OP(pixels, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index))));
+#endif /* defined(HAS_BIAS) */
+
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
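+
+/* Reference only: a scalar sketch of what one output element of the 1x1 direct
+ * convolution computes for unit strides, using a flat [channel][y][x] layout.
+ * The helper name and layout are assumptions for illustration; this is not the
+ * library's implementation and is never called by the kernels.
+ */
+inline float direct_conv1x1_ref(const float *src, const float *weights, float bias,
+                                int x, int y, int width, int height, int depth)
+{
+    float acc = bias;
+    for(int c = 0; c < depth; ++c)
+    {
+        // A 1x1 kernel reduces to a dot product along the channel dimension
+        acc += weights[c] * src[(c * height + y) * width + x];
+    }
+    return acc;
+}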
+
+#if defined(WEIGHTS_DEPTH)
+
+#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \
+ ({ \
+ acc.s0 = mad(src.s0, weight_value, acc.s0); \
+ acc.s1 = mad(src.s1, weight_value, acc.s1); \
+ acc.s2 = mad(src.s2, weight_value, acc.s2); \
+ acc.s3 = mad(src.s3, weight_value, acc.s3); \
+ })
+
+/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution1x1_f32_bifrost(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ // Get the kernel index
+ const int kernel_index = get_global_id(2);
+
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ float4 acc0 = 0.0f;
+ float4 acc1 = 0.0f;
+ float4 acc2 = 0.0f;
+ float4 acc3 = 0.0f;
+
+ __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+
+ for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
+ {
+ // Load the weights
+ float weight = *((__global float *)weights_addr);
+
+ // Load values from row0 of input tensor
+ float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+ float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+ float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+ float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+
+ CONVOLUTION1x1_BIFROST(acc0, src0, weight);
+ CONVOLUTION1x1_BIFROST(acc1, src1, weight);
+ CONVOLUTION1x1_BIFROST(acc2, src2, weight);
+ CONVOLUTION1x1_BIFROST(acc3, src3, weight);
+
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
+
+ acc0.s0 += bias;
+ acc0.s1 += bias;
+ acc0.s2 += bias;
+ acc0.s3 += bias;
+ acc1.s0 += bias;
+ acc1.s1 += bias;
+ acc1.s2 += bias;
+ acc1.s3 += bias;
+ acc2.s0 += bias;
+ acc2.s1 += bias;
+ acc2.s2 += bias;
+ acc2.s3 += bias;
+ acc3.s0 += bias;
+ acc3.s1 += bias;
+ acc3.s2 += bias;
+ acc3.s3 += bias;
+#endif /* defined(HAS_BIAS) */
+
+ vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+ vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+ vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
+ vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
+}
+#endif // defined(WEIGHTS_DEPTH)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/direct_convolution3x3.cl
new file mode 100644
index 0000000..d094eca
--- /dev/null
+++ b/src/core/CL/cl_kernels/direct_convolution3x3.cl
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+
+#define ADD_OP(a, b) ADD_SAT_OP_EXPAND((a), (b), DATA_TYPE_PROMOTED, 8)
+#define MUL_OP(a, b) MUL_SAT_OP_EXPAND(CONVERT((a), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), CONVERT((b), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), DATA_TYPE_PROMOTED, 8, FIXED_POINT_POSITION)
+
+// There is no need to have a larger intermediate type for qs32 because all the arguments are already promoted
+MULQ_SAT_IMPL(qs32x8, qs32x8)
+
+#else /* FIXED_POINT_POSITION */
+
+#undef CONVERT_SAT
+
+#define ADD_OP(a, b) ((a) + (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define CONVERT_SAT(a, b) ((a))
+
+#endif /* FIXED_POINT_POSITION */
+
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X not equal to 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X == 2 */
+
+#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 3) \
+ weights_values0 = vload3(0, weights_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ src0 = vload8(0, src_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ src1 = vload2(0, src_row_ptr + 8); \
+ \
+ acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \
+ acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
+ acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
+ })
+
+#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 3) \
+ weights_values0 = vload3(0, weights_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 16) \
+ src0 = vload16(0, src_row_ptr); \
+ DATA_TYPE src1 = *(src_row_ptr + 16); \
+ \
+ acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \
+ acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
+ acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
+ })
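+
+/* Illustrative note on the two macros above: for stride 1, element i of the
+ * accumulator receives src[i] * w0 + src[i + 1] * w1 + src[i + 2] * w2 for
+ * i = 0..7, which is why 10 consecutive input values are loaded (vload8 plus
+ * vload2). For stride 2 the pattern becomes src[2i] * w0 + src[2i + 1] * w1 +
+ * src[2i + 2] * w2, requiring 17 input values (vload16 plus one scalar).
+ */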
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note This OpenCL kernel works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution3x3(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
+ pixels0 = 0;
+
+ __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+
+ const int kernel_index = get_global_id(2);
+ weights_addr += kernel_index * weights_stride_w;
+
+ for(int d = 0; d < WEIGHTS_DEPTH; ++d)
+ {
+ CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
+ CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+ CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ pixels0 = ADD_OP(pixels0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))));
+#endif /* defined(HAS_BIAS) */
+
+ vstore8(CONVERT_SAT(pixels0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif //defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if defined(WEIGHTS_DEPTH)
+
+#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \
+ ({ \
+ acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0); \
+ acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1); \
+ acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2); \
+ acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3); \
+ acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0); \
+ acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1); \
+ acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2); \
+ acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3); \
+ acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0); \
+ acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1); \
+ acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2); \
+ acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3); \
+ })
+
+/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution3x3_f32_bifrost(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ // Get the kernel index
+ const int kernel_index = get_global_id(2);
+
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ float4 pixels0 = 0;
+ float4 pixels1 = 0;
+ float4 pixels2 = 0;
+
+ __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+
+ // Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor
+
+ for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
+ {
+ // Load the weights
+ float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+ float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+ float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+ float4 src0;
+ float2 src1;
+
+ // Load values from row0 of input tensor
+ src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4);
+
+ CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row0);
+
+ // Load values from row1 of input tensor
+ src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4);
+
+ // Accumulate
+ CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row1);
+ CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row0);
+
+ // Load values from row2 of input tensor
+ src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4);
+
+ // Accumulate
+ CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row2);
+ CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row1);
+ CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row0);
+
+ // Load values from row3 of input tensor
+ src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4);
+
+ // Accumulate
+ CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row2);
+ CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row1);
+
+ // Row4
+ src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4);
+
+ // Accumulate
+ CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row2);
+
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
+
+ pixels0 += (float4)bias;
+ pixels1 += (float4)bias;
+ pixels2 += (float4)bias;
+#endif /* defined(HAS_BIAS) */
+
+ vstore4(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+ vstore4(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+ vstore4(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
+}
+#endif // defined(WEIGHTS_DEPTH)
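+
+/* Illustrative note on the Bifrost 3x3 kernel above: each work-item produces a
+ * 4x3 output tile (pixels0..pixels2). Output row r accumulates input rows r,
+ * r + 1 and r + 2 against weight rows 0, 1 and 2 respectively, which is why the
+ * five loaded input rows (0..4) can be shared between the three accumulators
+ * instead of being reloaded per output row.
+ */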
diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/direct_convolution5x5.cl
new file mode 100644
index 0000000..496da97
--- /dev/null
+++ b/src/core/CL/cl_kernels/direct_convolution5x5.cl
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#undef CONVERT_SAT
+
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X not equal to 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X == 2 */
+
+#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ weights_values0 = vload4(0, weights_row_ptr); \
+ DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ src0 = vload8(0, src_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ src1 = vload4(0, src_row_ptr + 8); \
+ \
+ acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
+ })
+
+#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ weights_values0 = vload4(0, weights_row_ptr); \
+ DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \
+ VEC_DATA_TYPE(DATA_TYPE, 16) \
+ src0 = vload16(0, src_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ src1 = vload4(0, src_row_ptr + 16); \
+ acc += src0.even * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
+ \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
+ })
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution5x5(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels0 = 0;
+
+ __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+
+ const int kernel_index = get_global_id(2);
+ weights_addr += kernel_index * weights_stride_w;
+
+ for(int d = 0; d < WEIGHTS_DEPTH; ++d)
+ {
+ CONVOLUTION1x5(pixels0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
+ CONVOLUTION1x5(pixels0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+ CONVOLUTION1x5(pixels0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+ CONVOLUTION1x5(pixels0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
+ CONVOLUTION1x5(pixels0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
+
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ pixels0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index)));
+#endif /* defined(HAS_BIAS) */
+
+ vstore8(pixels0, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if defined(WEIGHTS_DEPTH)
+
+#define CONVOLUTION1x5_BIFROST(acc, src0, weights_row00, weights_row01) \
+ ({ \
+ acc.s0 = mad(src0.s0, weights_row00.s0, acc.s0); \
+ acc.s1 = mad(src0.s1, weights_row00.s0, acc.s1); \
+ acc.s2 = mad(src0.s2, weights_row00.s0, acc.s2); \
+ acc.s3 = mad(src0.s3, weights_row00.s0, acc.s3); \
+ acc.s0 = mad(src0.s1, weights_row00.s1, acc.s0); \
+ acc.s1 = mad(src0.s2, weights_row00.s1, acc.s1); \
+ acc.s2 = mad(src0.s3, weights_row00.s1, acc.s2); \
+ acc.s3 = mad(src0.s4, weights_row00.s1, acc.s3); \
+ acc.s0 = mad(src0.s2, weights_row00.s2, acc.s0); \
+ acc.s1 = mad(src0.s3, weights_row00.s2, acc.s1); \
+ acc.s2 = mad(src0.s4, weights_row00.s2, acc.s2); \
+ acc.s3 = mad(src0.s5, weights_row00.s2, acc.s3); \
+ acc.s0 = mad(src0.s3, weights_row00.s3, acc.s0); \
+ acc.s1 = mad(src0.s4, weights_row00.s3, acc.s1); \
+ acc.s2 = mad(src0.s5, weights_row00.s3, acc.s2); \
+ acc.s3 = mad(src0.s6, weights_row00.s3, acc.s3); \
+ acc.s0 = mad(src0.s4, weights_row01, acc.s0); \
+ acc.s1 = mad(src0.s5, weights_row01, acc.s1); \
+ acc.s2 = mad(src0.s6, weights_row01, acc.s2); \
+ acc.s3 = mad(src0.s7, weights_row01, acc.s3); \
+ })
+
+/** An optimized direct convolution 5x5 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution5x5_f32_bifrost(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ // Get the kernel index
+ const int kernel_index = get_global_id(2);
+
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ float4 pixels0 = 0.0f;
+ float4 pixels1 = 0.0f;
+
+ __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+
+ // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
+
+ for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
+ {
+ // Load the weights from row0 and row1
+ float4 weights_row00 = vload4(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+ float weights_row01 = *((__global float *)(weights_addr + 0 * weights_stride_y) + 4);
+ float4 weights_row10 = vload4(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+ float weights_row11 = *((__global float *)(weights_addr + 1 * weights_stride_y) + 4);
+ float8 src0;
+
+ // Load values from row0 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y));
+
+ // Accumulate
+ CONVOLUTION1x5_BIFROST(pixels0, src0, weights_row00, weights_row01);
+
+ // Load values from row1 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y));
+
+ // Accumulate
+ CONVOLUTION1x5_BIFROST(pixels0, src0, weights_row10, weights_row11);
+ CONVOLUTION1x5_BIFROST(pixels1, src0, weights_row00, weights_row01);
+
+ // Load values from row2 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y));
+
+ // Load weights from row2
+ weights_row00 = vload4(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+ weights_row01 = *((__global float *)(weights_addr + 2 * weights_stride_y) + 4);
+
+ // Accumulate
+ CONVOLUTION1x5_BIFROST(pixels0, src0, weights_row00, weights_row01);
+ CONVOLUTION1x5_BIFROST(pixels1, src0, weights_row10, weights_row11);
+
+ // Load values from row3 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y));
+
+ // Load weights from row3
+ weights_row10 = vload4(0, (__global float *)(weights_addr + 3 * weights_stride_y));
+ weights_row11 = *((__global float *)(weights_addr + 3 * weights_stride_y) + 4);
+
+ // Accumulate
+ CONVOLUTION1x5_BIFROST(pixels0, src0, weights_row10, weights_row11);
+ CONVOLUTION1x5_BIFROST(pixels1, src0, weights_row00, weights_row01);
+
+ // Load values from row4 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y));
+
+ // Load weights from row4
+ weights_row00 = vload4(0, (__global float *)(weights_addr + 4 * weights_stride_y));
+ weights_row01 = *((__global float *)(weights_addr + 4 * weights_stride_y) + 4);
+
+ CONVOLUTION1x5_BIFROST(pixels0, src0, weights_row00, weights_row01);
+ CONVOLUTION1x5_BIFROST(pixels1, src0, weights_row10, weights_row11);
+
+ // Load values from row5 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y));
+
+ // Accumulate
+ CONVOLUTION1x5_BIFROST(pixels1, src0, weights_row00, weights_row01);
+
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ float4 bias = (float4) * ((__global float *)(vector_offset(&biases, kernel_index)));
+
+ pixels0 += bias;
+ pixels1 += bias;
+#endif /* defined(HAS_BIAS) */
+
+ vstore4(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+ vstore4(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+}
+#endif // defined(WEIGHTS_DEPTH)
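+
+/* Illustrative example only: a plausible set of compile-time options for the
+ * generic direct convolution kernels in these files (all values hypothetical):
+ *
+ *   -DDATA_TYPE=float -DDATA_TYPE_PROMOTED=float -DDATA_SIZE=32
+ *   -DSTRIDE_X=1 -DWEIGHTS_DEPTH=64 -DHAS_BIAS
+ *
+ * DATA_SIZE is consumed only by the 1x1 kernel's INPUT_PIXEL macro, and the
+ * *_f32_bifrost variants additionally assume stride_x = stride_y = 1 and F32
+ * data.
+ */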
diff --git a/src/core/CL/cl_kernels/fast_corners.cl b/src/core/CL/cl_kernels/fast_corners.cl
index 470d14a..3e1929c 100644
--- a/src/core/CL/cl_kernels/fast_corners.cl
+++ b/src/core/CL/cl_kernels/fast_corners.cl
@@ -206,12 +206,11 @@
return;
}
-#ifndef USE_MAXSUPPRESSION
- *out.ptr = 1;
-#else
-
+#ifdef USE_MAXSUPPRESSION
*out.ptr = compute_strength(p, in.ptr, input_stride_y, threshold);
-#endif
+#else /* USE_MAXSUPPRESSION */
+ *out.ptr = 1;
+#endif /* USE_MAXSUPPRESSION */
}
/** Copy result to Keypoint buffer and count number of corners
@@ -240,7 +239,7 @@
{
return;
}
-#endif
+#endif /* UPDATE_NUMBER */
Image in = CONVERT_TO_IMAGE_STRUCT(input);
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl
index df63586..fbd4f6a 100644
--- a/src/core/CL/cl_kernels/fill_border.cl
+++ b/src/core/CL/cl_kernels/fill_border.cl
@@ -23,6 +23,10 @@
*/
#include "helpers.h"
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+#endif /* FIXED_POINT_POSITION */
+
/** Fill N pixel of the padding edge of a single channel image by replicating the closest valid pixel.
*
* @attention The DATA_TYPE needs to be passed at the compile time.
@@ -36,18 +40,20 @@
* @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_stride_z Stride between images if batching images (in bytes)
+ * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] width Width of the valid region of the image
* @param[in] height Height of the valid region of the image
* @param[in] start_pos XY coordinate indicating the start point of the valid region
*/
__kernel void fill_image_borders_replicate(
- IMAGE_DECLARATION(buf),
+ TENSOR3D_DECLARATION(buf),
uint width,
uint height,
int2 start_pos)
{
- Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf);
+ Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
// Update pointer to point to the starting point of the valid region
buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
@@ -109,6 +115,8 @@
* @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_stride_z Stride between images if batching images (in bytes)
+ * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] width Width of the valid region of the image
* @param[in] height Height of the valid region of the image
@@ -116,13 +124,13 @@
* @param[in] constant_value Constant value to use to fill the edges
*/
__kernel void fill_image_borders_constant(
- IMAGE_DECLARATION(buf),
+ TENSOR3D_DECLARATION(buf),
uint width,
uint height,
int2 start_pos,
DATA_TYPE constant_value)
{
- Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf);
+ Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
// Update pointer to point to the starting point of the valid region
buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
diff --git a/src/core/CL/cl_kernels/fixed_point.h b/src/core/CL/cl_kernels/fixed_point.h
new file mode 100644
index 0000000..5476a6e
--- /dev/null
+++ b/src/core/CL/cl_kernels/fixed_point.h
@@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_FIXED_POINT_H
+#define ARM_COMPUTE_FIXED_POINT_H
+
+#define TYPE_ALIAS(type, alias) \
+ typedef type alias; \
+ typedef type alias##x##1; \
+ typedef type##2 alias##x##2; \
+ typedef type##3 alias##x##3; \
+ typedef type##4 alias##x##4; \
+ typedef type##8 alias##x##8; \
+ typedef type##16 alias##x##16;
+
+TYPE_ALIAS(char, qs8)
+TYPE_ALIAS(short, qs16)
+TYPE_ALIAS(int, qs32)
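+
+/* For example, TYPE_ALIAS(char, qs8) above makes qs8 and qs8x1 aliases of char,
+ * qs8x2 of char2, qs8x3 of char3, qs8x4 of char4, qs8x8 of char8 and qs8x16 of
+ * char16.
+ */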
+
+#define qs8_MIN ((char)CHAR_MIN)
+#define qs8_MAX ((char)CHAR_MAX)
+#define qs16_MIN ((short)SHRT_MIN)
+#define qs16_MAX ((short)SHRT_MAX)
+#define qs32_MIN ((int)INT_MIN)
+#define qs32_MAX ((int)INT_MAX)
+
+#define qu8_MIN ((uchar)0)
+#define qu8_MAX ((uchar)UCHAR_MAX)
+#define qu16_MIN ((ushort)0)
+#define qu16_MAX ((ushort)USHRT_MAX)
+#define qu32_MIN ((uint)0)
+#define qu32_MAX ((uint)UINT_MAX)
+
+#define qs8_TYPE char
+#define qs8x1_TYPE char
+#define qs8x2_TYPE char2
+#define qs8x3_TYPE char3
+#define qs8x4_TYPE char4
+#define qs8x8_TYPE char8
+#define qs8x16_TYPE char16
+
+#define qs16_TYPE short
+#define qs16x1_TYPE short
+#define qs16x2_TYPE short2
+#define qs16x3_TYPE short3
+#define qs16x4_TYPE short4
+#define qs16x8_TYPE short8
+#define qs16x16_TYPE short16
+
+#define qs32_TYPE int
+#define qs32x1_TYPE int
+#define qs32x2_TYPE int2
+#define qs32x3_TYPE int3
+#define qs32x4_TYPE int4
+#define qs32x8_TYPE int8
+#define qs32x16_TYPE int16
+
+/* All internal constants are represented in the maximum supported fixed point format (QS16),
+ * thus we define an additional shift parameter required to convert the constant
+ * from the maximum supported format to the required one.
+ */
+#define qs8_SHIFT 8
+#define qs16_SHIFT 0
+
+#undef VEC_DATA_TYPE_STR
+#undef VEC_DATA_TYPE
+#undef CONVERT_STR
+#undef CONVERT
+#undef CONVERT_SAT_STR
+#undef CONVERT_SAT
+
+#define VEC_DATA_TYPE_STR(type, size) type##x##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR3(x, type, rtype) (convert_##rtype((x)))
+#define CONVERT_STR2(x, type, rtype) CONVERT_STR3(x, type, rtype)
+#define CONVERT_STR(x, type) CONVERT_STR2(x, type, type##_TYPE)
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR3(x, type, rtype) (convert_##rtype##_sat((x)))
+#define CONVERT_SAT_STR2(x, type, rtype) CONVERT_SAT_STR3(x, type, rtype)
+#define CONVERT_SAT_STR(x, type) CONVERT_SAT_STR2(x, type, type##_TYPE)
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
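
With these helpers, VEC_DATA_TYPE(qs8, 16) expands to qs8x16 and CONVERT_SAT(x, qs8x16) expands to the OpenCL built-in convert_char16_sat(x), i.e. a conversion that clamps instead of wrapping. A minimal scalar C sketch of what that saturating conversion does for one QS8 lane (illustrative names, not part of the header):

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of convert_char_sat(): clamp a wider value into the QS8 range. */
    static int8_t convert_qs8_sat(int32_t v)
    {
        if(v > INT8_MAX) return INT8_MAX; /* saturate high */
        if(v < INT8_MIN) return INT8_MIN; /* saturate low */
        return (int8_t)v;
    }

    int main(void)
    {
        printf("%d %d\n", convert_qs8_sat(200), convert_qs8_sat(-300)); /* prints 127 -128 */
        return 0;
    }
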
+
+/** Computes the saturating absolute value of a fixed point vector.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point absolute value.
+ */
+#define ABSQ_SAT_IMPL(type) \
+ inline type abs_##type##_sat(type VopA) \
+ { \
+ return CONVERT_SAT(abs(VopA), type); \
+ }
+
+ABSQ_SAT_IMPL(qs8x16)
+ABSQ_SAT_IMPL(qs16x8)
+
+#define ABS_SAT_OP_EXPAND_STR(a, type, size) abs_##type##x##size##_sat((a))
+#define ABS_SAT_OP_EXPAND(a, type, size) ABS_SAT_OP_EXPAND_STR(a, type, size)
+
+/** Computes max of fixed point types.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point maximum.
+ */
+#define MAXQ_IMPL(type) \
+ inline type max_##type(type VopA, type VopB) \
+ { \
+ return max(VopA, VopB); \
+ }
+
+MAXQ_IMPL(qs8x1)
+MAXQ_IMPL(qs8x2)
+MAXQ_IMPL(qs8x4)
+MAXQ_IMPL(qs8x8)
+MAXQ_IMPL(qs8x16)
+MAXQ_IMPL(qs16x1)
+MAXQ_IMPL(qs16x2)
+MAXQ_IMPL(qs16x4)
+MAXQ_IMPL(qs16x8)
+MAXQ_IMPL(qs16x16)
+
+#define MAX_OP_EXPAND_STR(a, b, type, size) max_##type##x##size((a), (b))
+#define MAX_OP_EXPAND(a, b, type, size) MAX_OP_EXPAND_STR(a, b, type, size)
+
+/** Computes saturated addition of fixed point types.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point addition. The result is saturated in case of overflow
+ */
+#define ADDQ_SAT_IMPL(type) \
+ inline type add_sat_##type(type VopA, type VopB) \
+ { \
+ return add_sat(VopA, VopB); \
+ }
+
+ADDQ_SAT_IMPL(qs8x1)
+ADDQ_SAT_IMPL(qs8x2)
+ADDQ_SAT_IMPL(qs8x4)
+ADDQ_SAT_IMPL(qs8x8)
+ADDQ_SAT_IMPL(qs8x16)
+ADDQ_SAT_IMPL(qs16x1)
+ADDQ_SAT_IMPL(qs16x2)
+ADDQ_SAT_IMPL(qs16x4)
+ADDQ_SAT_IMPL(qs16x8)
+ADDQ_SAT_IMPL(qs16x16)
+ADDQ_SAT_IMPL(qs32x1)
+ADDQ_SAT_IMPL(qs32x2)
+ADDQ_SAT_IMPL(qs32x4)
+ADDQ_SAT_IMPL(qs32x8)
+ADDQ_SAT_IMPL(qs32x16)
+
+#define ADD_SAT_OP_EXPAND_STR(a, b, type, size) add_sat_##type##x##size((a), (b))
+#define ADD_SAT_OP_EXPAND(a, b, type, size) ADD_SAT_OP_EXPAND_STR(a, b, type, size)
+
+/** Computes saturated subtraction of fixed point types.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point subtraction. The result is saturated in case of overflow
+ */
+#define SUBQ_SAT_IMPL(type) \
+ inline type sub_sat_##type(type VopA, type VopB) \
+ { \
+ return sub_sat(VopA, VopB); \
+ }
+
+SUBQ_SAT_IMPL(qs8x1)
+SUBQ_SAT_IMPL(qs8x2)
+SUBQ_SAT_IMPL(qs8x4)
+SUBQ_SAT_IMPL(qs8x8)
+SUBQ_SAT_IMPL(qs8x16)
+SUBQ_SAT_IMPL(qs16x1)
+SUBQ_SAT_IMPL(qs16x2)
+SUBQ_SAT_IMPL(qs16x4)
+SUBQ_SAT_IMPL(qs16x8)
+SUBQ_SAT_IMPL(qs16x16)
+
+#define SUB_SAT_OP_EXPAND_STR(a, b, type, size) sub_sat_##type##x##size((a), (b))
+#define SUB_SAT_OP_EXPAND(a, b, type, size) SUB_SAT_OP_EXPAND_STR(a, b, type, size)
+
+/** Multiplication of two fixed point numbers
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiplication.
+ */
+#define MULQ_IMPL(type, itype) \
+ inline type mul_##type(type VopA, type VopB, int fixed_point_position) \
+ { \
+ itype round_val = (itype)(1 << (fixed_point_position - 1)); \
+ itype res = CONVERT((VopA), itype) * CONVERT((VopB), itype) + round_val; \
+ return CONVERT((res >> (itype)fixed_point_position), type); \
+ }
+
+MULQ_IMPL(qs8x8, qs16x8)
+MULQ_IMPL(qs16x8, qs32x8)
+MULQ_IMPL(qs8x16, qs16x16)
+MULQ_IMPL(qs16x16, qs32x16)
+
+#define MUL_OP_EXPAND_STR(a, b, type, size, position) mul_##type##x##size((a), (b), (position))
+#define MUL_OP_EXPAND(a, b, type, size, position) MUL_OP_EXPAND_STR(a, b, type, size, position)
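
A worked scalar example of the scheme MULQ_IMPL implements (widen both operands, multiply, add the rounding term 1 << (p - 1), shift right by p), written in plain C for QS8 with fixed_point_position = 4; this is an illustrative sketch rather than library code:

    #include <stdint.h>
    #include <stdio.h>

    /* QS8 multiply with rounding: mirrors mul_qs8x8 for a single lane. */
    static int8_t mul_qs8(int8_t a, int8_t b, int p)
    {
        int16_t round_val = (int16_t)(1 << (p - 1));
        int16_t res       = (int16_t)((int16_t)a * (int16_t)b + round_val);
        return (int8_t)(res >> p);
    }

    int main(void)
    {
        /* With p = 4: 1.5 -> 24, 2.25 -> 36; expected product 3.375 -> 54. */
        printf("%d\n", mul_qs8(24, 36, 4)); /* prints 54 */
        return 0;
    }
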
+
+/** Saturate multiply of two fixed point numbers
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiplication. The result is saturated in case of overflow
+ */
+#define MULQ_SAT_IMPL(type, itype) \
+ inline type mul_sat_##type(type VopA, type VopB, int fixed_point_position) \
+ { \
+ itype round_val = (itype)(1 << (fixed_point_position - 1)); \
+ itype res = mad_sat(CONVERT((VopA), itype), CONVERT((VopB), itype), round_val); \
+ return CONVERT_SAT((res >> (itype)fixed_point_position), type); \
+ }
+
+MULQ_SAT_IMPL(qs8x1, qs16x1)
+MULQ_SAT_IMPL(qs8x2, qs16x2)
+MULQ_SAT_IMPL(qs8x3, qs16x3)
+MULQ_SAT_IMPL(qs8x4, qs16x4)
+MULQ_SAT_IMPL(qs8x8, qs16x8)
+MULQ_SAT_IMPL(qs8x16, qs16x16)
+MULQ_SAT_IMPL(qs16x1, qs32x1)
+MULQ_SAT_IMPL(qs16x2, qs32x2)
+MULQ_SAT_IMPL(qs16x3, qs32x3)
+MULQ_SAT_IMPL(qs16x4, qs32x4)
+MULQ_SAT_IMPL(qs16x8, qs32x8)
+MULQ_SAT_IMPL(qs16x16, qs32x16)
+
+#define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) mul_sat_##type##x##size((a), (b), (position))
+#define MUL_SAT_OP_EXPAND(a, b, type, size, position) MUL_SAT_OP_EXPAND_STR(a, b, type, size, position)
+
+/** Saturate multiply-accumulate
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiply-accumulate. The result is saturated in case of overflow
+ */
+#define MLAQ_SAT_IMPL(type, itype) \
+ type mla_sat_##type(type VopA, type VopB, type VopC, int fixed_point_position) \
+ { \
+ itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), (itype)(1 << (fixed_point_position - 1))); \
+ return add_sat(VopA, CONVERT_SAT(res >> (itype)fixed_point_position, type)); \
+ }
+
+MLAQ_SAT_IMPL(qs8x8, qs16x8)
+MLAQ_SAT_IMPL(qs8x16, qs16x16)
+MLAQ_SAT_IMPL(qs16x8, qs32x8)
+
+#define MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) mla_sat_##type##x##size((a), (b), (c), (position))
+#define MLA_SAT_OP_EXPAND(a, b, c, type, size, position) MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
+
+/** Saturate multiply-accumulate long
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiply-accumulate long. The result is saturated in case of overflow
+ */
+#define MLALQ_SAT_IMPL(type, itype) \
+ itype mlal_sat_##type(itype VopA, type VopB, type VopC, int fixed_point_position) \
+ { \
+ itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), (itype)(1 << (fixed_point_position - 1))); \
+ return add_sat(VopA, res >> (itype)fixed_point_position); \
+ }
+
+MLALQ_SAT_IMPL(qs8x8, qs16x8)
+MLALQ_SAT_IMPL(qs16x8, qs32x8)
+
+#define MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) mlal_sat_##type##x##size((a), (b), (c), (position))
+#define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
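
Unlike MLA, the MLAL accumulator stays in the wider intermediate type, so repeated accumulation (as in the QS8 GEMM kernel further down) is only narrowed and saturated once at the end. A one-lane C sketch of the pattern, assuming QS8 operands and a QS16 accumulator (saturation omitted for brevity; the kernel uses mad_sat/add_sat):

    #include <stdint.h>
    #include <stdio.h>

    /* One lane of the mlal_sat_qs8x8 pattern: acc (QS16) += (a * b) rescaled, with rounding. */
    static int16_t mlal_qs8(int16_t acc, int8_t a, int8_t b, int p)
    {
        int16_t res = (int16_t)((int16_t)a * (int16_t)b + (int16_t)(1 << (p - 1)));
        return (int16_t)(acc + (res >> p));
    }

    int main(void)
    {
        /* p = 4: accumulate 1.5 * 2.0 twice -> 6.0, i.e. 96 in QS16 with p = 4. */
        int16_t acc = 0;
        acc = mlal_qs8(acc, 24, 32, 4);
        acc = mlal_qs8(acc, 24, 32, 4);
        printf("%d\n", acc); /* prints 96 */
        return 0;
    }
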
+
+/** Saturate division of two fixed point vectors
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point division. The result is saturated in case of overflow
+ */
+#define DIVQ_SAT_IMPL(stype, type, itype) \
+ inline type div_sat_##type(type VopA, type VopB, int fixed_point_position) \
+ { \
+ itype conv_a = CONVERT((VopA), itype); \
+ itype denominator = CONVERT((VopB), itype); \
+ itype numerator = conv_a << (itype)(fixed_point_position); \
+ itype res = select((itype)(numerator / denominator), select((itype)stype##_MAX, (itype)stype##_MIN, (itype)(conv_a < (itype)0)), (itype)(denominator == (itype)0)); \
+ return CONVERT_SAT((res), type); \
+ }
+
+DIVQ_SAT_IMPL(qs8, qs8x16, qs16x16)
+DIVQ_SAT_IMPL(qs16, qs16x8, qs32x8)
+DIVQ_SAT_IMPL(qs16, qs16x16, qs32x16)
+DIVQ_SAT_IMPL(qs8, qs8, qs16)
+DIVQ_SAT_IMPL(qs16, qs16, qs32)
+
+#define DIV_SAT_OP_EXPAND_STR(a, b, type, position) div_sat_##type((a), (b), (position))
+#define DIV_SAT_OP_EXPAND(a, b, type, position) DIV_SAT_OP_EXPAND_STR(a, b, type, position)
+
+#define DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) div_sat_##type##x##size((a), (b), (position))
+#define DIV_SAT_OP_VEC_EXPAND(a, b, type, size, position) DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position)
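
The division follows the usual fixed-point recipe: widen the numerator, pre-shift it left by fixed_point_position so the quotient stays in Q format, then divide; a zero denominator saturates to MAX or MIN depending on the numerator's sign. A one-lane C sketch under those assumptions:

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of div_sat_qs8: (a << p) / b, saturating when b == 0. */
    static int8_t div_sat_qs8_scalar(int8_t a, int8_t b, int p)
    {
        if(b == 0)
        {
            return (a < 0) ? INT8_MIN : INT8_MAX; /* saturate on division by zero */
        }
        int16_t num = (int16_t)((int16_t)a << p);
        int16_t res = (int16_t)(num / (int16_t)b);
        if(res > INT8_MAX) return INT8_MAX;
        if(res < INT8_MIN) return INT8_MIN;
        return (int8_t)res;
    }

    int main(void)
    {
        /* p = 4: 3.0 -> 48, 1.5 -> 24; 3.0 / 1.5 = 2.0 -> 32. */
        printf("%d\n", div_sat_qs8_scalar(48, 24, 4)); /* prints 32 */
        return 0;
    }
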
+
+/** Saturate exponential of a fixed point vector
+ *
+ * @note The implemented approach uses a Taylor polynomial to approximate the exponential function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of calculated elements.
+ *
+ * @return The result of the fixed point exponential. The result is saturated in case of overflow
+ */
+#define EXPQ_IMPL(stype, type, size) \
+ inline type exp_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_one = (type)(1 << (fixed_point_position)); \
+ type ln2 = (type)((((0x58B9 >> (14 - fixed_point_position))) + 1) >> 1); \
+ type inv_ln2 = (type)((((0x38AA >> (14 - fixed_point_position)) + 1) >> 1)) | const_one; \
+ type A = (type)(((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1); \
+ type B = (type)(((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1); \
+ type C = (type)(((0x1693 >> (14 - fixed_point_position)) + 1) >> 1); \
+ type D = (type)(((0x0592 >> (14 - fixed_point_position)) + 1) >> 1); \
+ type m = MUL_SAT_OP_EXPAND(VopA, inv_ln2, stype, size, fixed_point_position); \
+ type dec_m = m >> (type)fixed_point_position; \
+ type alpha = MUL_SAT_OP_EXPAND(dec_m << (type)fixed_point_position, ln2, stype, size, fixed_point_position); \
+ alpha = CONVERT(abs_diff(VopA, alpha), type); \
+ type sum = add_sat(MUL_SAT_OP_EXPAND(alpha, D, stype, size, fixed_point_position), C); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), B); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), A); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), const_one); \
+ return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0), clz(sum) > dec_m); /* Saturate result if needed */ \
+ }
+
+EXPQ_IMPL(qs8, qs8x16, 16)
+EXPQ_IMPL(qs16, qs16x8, 8)
+EXPQ_IMPL(qs16, qs16x16, 16)
+
+#define EXP_OP_EXPAND_STR(a, type, size, position) exp_sat_##type##x##size((a), (position))
+#define EXP_OP_EXPAND(a, type, size, position) EXP_OP_EXPAND_STR(a, type, size, position)
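
The approximation works by range reduction: the integer part m of x / ln2 is extracted via inv_ln2, e^x is rewritten as 2^m * e^r with r = |x - m * ln2|, e^r is evaluated with the 4-term polynomial whose coefficients are the hex constants above (approximately 0x7FBA/2^15, 0x3FE9/2^15, 0x1693/2^15 and 0x0592/2^15), and the final shift by m applies the 2^m factor, saturating if it would overflow. A floating-point C sketch of the same reduction, for illustration only:

    #include <math.h>
    #include <stdio.h>

    /* Float model of exp_sat_qs*: range-reduce by ln2, evaluate a 4-term polynomial
     * for e^r, then re-apply the 2^m factor. Coefficients are the kernel's hex
     * constants divided by 2^15 (approximate values). */
    static float exp_approx(float x)
    {
        const float ln2 = 0.693147f;
        const float A = 0.99890f, B = 0.49930f, C = 0.17636f, D = 0.04364f;

        int   m = (int)floorf(x / ln2);      /* integer part of x / ln2  */
        float r = fabsf(x - (float)m * ln2); /* residual in [0, ln2)     */
        float p = 1.0f + r * (A + r * (B + r * (C + r * D)));
        return ldexpf(p, m);                 /* p * 2^m                  */
    }

    int main(void)
    {
        printf("%f vs %f\n", exp_approx(1.0f), expf(1.0f)); /* both ~2.718 */
        return 0;
    }
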
+
+/** Saturate logarithm of a fixed point vector
+ *
+ * @note The implemented approach uses a Taylor polynomial to approximate the logarithm function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of calculated elements.
+ *
+ * @return The result of the fixed point logarithm. The result is saturated in case of overflow
+ */
+#define LOGQ_IMPL(stype, type, size) \
+ inline type log_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_one = (type)(1 << (fixed_point_position)); \
+ type ln2 = (type)(0x58B9 >> (15 - fixed_point_position)); /* 0.6931471 */ \
+ type A = (type)(0x5C0F >> (14 - fixed_point_position)); /* 1.4384189 */ \
+ type B = -(type)(0x56AE >> (15 - fixed_point_position)); /* -0.6771900 */ \
+ type C = (type)(0x2933 >> (15 - fixed_point_position)); /* 0.3218538 */ \
+ type D = -(type)(0x0AA7 >> (15 - fixed_point_position)); /* -0.0832229 */ \
+ type inter_a = select(VopA, DIV_SAT_OP_VEC_EXPAND(const_one, VopA, stype, size, fixed_point_position), VopA < const_one); \
+ type shift_val = (type)(15 - stype##_SHIFT) - clz(inter_a >> (type)fixed_point_position); \
+ inter_a = inter_a >> shift_val; \
+ inter_a = sub_sat(inter_a, const_one); \
+ type sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, D, stype, size, fixed_point_position), C); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), B); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), A); \
+ sum = MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position); \
+ sum = MUL_SAT_OP_EXPAND(add_sat(sum, shift_val << (type)fixed_point_position), ln2, stype, size, fixed_point_position); \
+ return select(select(sum, -sum, VopA < const_one), (type)0, VopA < (type)0); /* Saturate result if needed */ \
+ }
+
+LOGQ_IMPL(qs8, qs8x16, 16)
+LOGQ_IMPL(qs16, qs16x8, 8)
+LOGQ_IMPL(qs16, qs16x16, 16)
+
+#define LOG_OP_EXPAND_STR(a, type, size, position) log_sat_##type##x##size((a), (position))
+#define LOG_OP_EXPAND(a, type, size, position) LOG_OP_EXPAND_STR(a, type, size, position)
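
The logarithm mirrors this: inputs below one are inverted first (ln x = -ln(1/x)), the value is normalised into [1, 2) by a right shift of shift_val bits, a polynomial in (m - 1) approximates log2(m), the exponent shift_val is added back, and the sum is scaled by ln2. A floating-point C sketch of that flow, with coefficients taken from the hex constants above (approximately 1.4384, -0.6772, 0.3219, -0.0832):

    #include <math.h>
    #include <stdio.h>

    /* Float model of log_sat_qs*: normalise into [1, 2), approximate log2 of the
     * mantissa, add the exponent, scale by ln2. Negative inputs map to 0 in the
     * kernel; zero is guarded here as well. */
    static float log_approx(float x)
    {
        if(x <= 0.0f)
        {
            return 0.0f;
        }
        int negate = 0;
        if(x < 1.0f)
        {
            x      = 1.0f / x;          /* ln(x) = -ln(1/x) */
            negate = 1;
        }
        int s = 0;
        while(x >= 2.0f)                /* bring x into [1, 2) */
        {
            x *= 0.5f;
            ++s;
        }
        const float A = 1.4384f, B = -0.6772f, C = 0.3219f, D = -0.0832f;
        float t    = x - 1.0f;
        float poly = t * (A + t * (B + t * (C + t * D)));    /* ~ log2(x) */
        float res  = (poly + (float)s) * 0.693147f;          /* ln(x)     */
        return negate ? -res : res;
    }

    int main(void)
    {
        printf("%f vs %f\n", log_approx(1.5f), logf(1.5f)); /* both ~0.405 */
        return 0;
    }
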
+
+/** Saturate inverse square root of a fixed point vector
+ *
+ * @note Implemented approach uses Newton's method to approximate the inverse square root function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of calculated elements.
+ *
+ * @return The result of the fixed point inverse square root. The result is saturated in case of overflow
+ */
+#define INVSQRTQ_IMPL(stype, type, size) \
+ inline type invsqrt_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_three = (type)(3 << (fixed_point_position)); \
+ type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position); \
+ type temp = select((type)(VopA >> shift_value), select((type)stype##_MAX, (type)(VopA << (-shift_value)), (type)(clz(VopA) > (-shift_value))), (type)(shift_value < (type)0)); \
+ type x = temp; \
+ x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
+ x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
+ x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
+ if(sizeof((stype)(1)) > 1) /* Perform more iterations if datatype is QS16 */ \
+ { \
+ x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
+ x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
+ } \
+ type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0); \
+ return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2), (type)(clz(x) > shift_value2)), (type)(shift_value < (type)0)); /* Saturate result if needed */ \
+ }
+
+INVSQRTQ_IMPL(qs8, qs8x1, 1)
+INVSQRTQ_IMPL(qs16, qs16x1, 1)
+INVSQRTQ_IMPL(qs8, qs8x16, 16)
+INVSQRTQ_IMPL(qs16, qs16x8, 8)
+
+#define INVSQRT_OP_EXPAND_STR(a, type, size, position) invsqrt_sat_##type##x##size((a), (position))
+#define INVSQRT_OP_EXPAND(a, type, size, position) INVSQRT_OP_EXPAND_STR(a, type, size, position)
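
Each of the repeated lines above is one Newton step for 1/sqrt(a): x <- x * (3 - a * x^2) / 2, applied to an input that has first been brought into a well-conditioned range by a power-of-two shift which is undone on the result. A floating-point C sketch of the iteration itself:

    #include <math.h>
    #include <stdio.h>

    /* Newton iteration for 1/sqrt(a): x <- x * (3 - a * x * x) / 2. */
    static float invsqrt_newton(float a, float x0, int steps)
    {
        float x = x0;
        for(int i = 0; i < steps; ++i)
        {
            x = x * (3.0f - a * x * x) * 0.5f;
        }
        return x;
    }

    int main(void)
    {
        /* Starting from a rough guess, a few steps converge quickly. */
        printf("%f vs %f\n", invsqrt_newton(2.0f, 0.7f, 3), 1.0f / sqrtf(2.0f));
        return 0;
    }
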
+
+/** Saturate hyperbolic tangent of a fixed point vector
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of calculated elements.
+ *
+ * @return The result of the fixed point hyperbolic tangent. The result is saturated in case of overflow
+ */
+#define TANHQ_IMPL(stype, type, size) \
+ inline type tanh_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_one = (type)(1 << (fixed_point_position)); \
+ type const_two = (type)(2 << (fixed_point_position)); \
+ type exp2x = EXP_OP_EXPAND(MUL_SAT_OP_EXPAND(const_two, VopA, stype, size, fixed_point_position), stype, size, fixed_point_position); \
+ type num = SUB_SAT_OP_EXPAND(exp2x, const_one, stype, size); \
+ type den = ADD_SAT_OP_EXPAND(exp2x, const_one, stype, size); \
+ return DIV_SAT_OP_VEC_EXPAND(num, den, stype, size, fixed_point_position); \
+ }
+
+TANHQ_IMPL(qs8, qs8x16, 16)
+TANHQ_IMPL(qs16, qs16x8, 8)
+
+#define TANH_OP_EXPAND_STR(a, type, size, position) tanh_sat_##type##x##size((a), (position))
+#define TANH_OP_EXPAND(a, type, size, position) TANH_OP_EXPAND_STR(a, type, size, position)
+
+#define floatx16 float16
+#define float16_TYPE float16
+
+#define CONVERTQ_DOWN_IMPL(in_type, out_type) \
+ inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
+ { \
+ return CONVERT(a * (1 << fixed_point_position) + select((in_type)-0.5, (in_type)0.5, isgreater(a, (in_type)0)), out_type); \
+ }
+
+CONVERTQ_DOWN_IMPL(float16, qs8x16)
+CONVERTQ_DOWN_IMPL(float16, qs16x16)
+
+#define CONVERTQ_DOWN_SAT_IMPL(in_type, out_type) \
+ inline out_type convert_##out_type##_##in_type##_sat(in_type a, int fixed_point_position) \
+ { \
+ return CONVERT_SAT(a * (1 << fixed_point_position) + select((in_type)-0.5, (in_type)0.5, isgreater(a, (in_type)0)), out_type); \
+ }
+
+CONVERTQ_DOWN_SAT_IMPL(float16, qs8x16)
+CONVERTQ_DOWN_SAT_IMPL(float16, qs16x16)
+
+#define CONVERTQ_UP_IMPL(in_type, out_type) \
+ inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
+ { \
+ return CONVERT(a, out_type) / (1 << fixed_point_position); \
+ }
+
+CONVERTQ_UP_IMPL(qs8x16, float16)
+CONVERTQ_UP_IMPL(qs16x16, float16)
+
+#define SQCVT_SAT_IMPL(type) \
+ inline type sqcvt_##type##_sat(float a, int fixed_point_position) \
+ { \
+ return CONVERT_SAT((a * (1 << fixed_point_position) + ((a < 0) ? -0.5f : 0.5f)), type); \
+ }
+
+SQCVT_SAT_IMPL(qs8)
+SQCVT_SAT_IMPL(qs16)
+
+#define SQCVT_SAT_OP_EXPAND_STR(a, type, position) sqcvt_##type##_sat((a), (position))
+#define SQCVT_SAT_OP_EXPAND(a, type, position) SQCVT_SAT_OP_EXPAND_STR((a), type, position)
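
These conversions are the standard ones: to fixed point, multiply by 2^fixed_point_position and round to nearest (the +/-0.5 term); back to float, divide by 2^fixed_point_position. A one-lane C sketch, illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    /* float -> QS8 with round-to-nearest and saturation, and QS8 -> float. */
    static int8_t float_to_qs8(float a, int p)
    {
        float scaled = a * (float)(1 << p) + ((a < 0.0f) ? -0.5f : 0.5f);
        if(scaled > 127.0f)  return 127;   /* saturate */
        if(scaled < -128.0f) return -128;
        return (int8_t)scaled;
    }

    static float qs8_to_float(int8_t q, int p)
    {
        return (float)q / (float)(1 << p);
    }

    int main(void)
    {
        int8_t q = float_to_qs8(1.3f, 5);            /* 1.3 * 32 = 41.6 -> 42       */
        printf("%d -> %f\n", q, qs8_to_float(q, 5)); /* 42 -> 1.3125 (quantisation) */
        return 0;
    }
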
+
+#endif // ARM_COMPUTE_FIXED_POINT_H
diff --git a/src/core/CL/cl_kernels/floor.cl b/src/core/CL/cl_kernels/floor.cl
new file mode 100644
index 0000000..e967e6b
--- /dev/null
+++ b/src/core/CL/cl_kernels/floor.cl
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform a floor operation on an input tensor.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void floor_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)
+ (floor(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr);
+}
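
The -DDATA_TYPE and -DVEC_SIZE defines above are supplied as OpenCL program build options. A minimal host-side C sketch of building such a program directly with the standard OpenCL API (the helper name and the handles passed in are assumptions for illustration, not library code):

    #include <CL/cl.h>
    #include <stdio.h>

    /* Build a program containing floor_layer with the compile-time options the
     * kernel documentation asks for (hypothetical helper). */
    static int build_floor_program(cl_program program, cl_device_id device)
    {
        const char *options = "-DDATA_TYPE=float -DVEC_SIZE=16";
        cl_int      err     = clBuildProgram(program, 1, &device, options, NULL, NULL);
        if(err != CL_SUCCESS)
        {
            fprintf(stderr, "clBuildProgram failed: %d\n", (int)err);
            return -1;
        }
        return 0;
    }
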
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index caf6e3f..35a2e47 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -23,55 +23,59 @@
*/
#include "helpers.h"
+#ifdef FIXED_POINT_POSITION
+#include "fixed_point.h"
+#endif // FIXED_POINT_POSITION
+
/** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_transpose1x4_f32(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_transpose1x4(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
{
uint x = get_global_id(0);
uint y = get_global_id(1);
- /* Compute address for Matrix B - source */
+ // Compute address for Matrix B - source
Image src = CONVERT_TO_IMAGE_STRUCT(src);
- /* Compute address for Matrix B transposed - destination. X and Y are swapped */
+ // Compute address for Matrix B transposed - destination. X and Y are swapped
uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
- float4 b0 = vload4(0, (__global float *)src.ptr);
+ uint4 b0 = vload4(0, (__global uint *)src.ptr);
- vstore4(b0, 0, (__global float *)(dst_ptr + dst_addr_in_bytes));
+ vstore4(b0, 0, (__global uint *)(dst_ptr + dst_addr_in_bytes));
}
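
The switch from float4 to uint4 loads and stores is what makes the renamed kernel type-agnostic: the transposition only rearranges 32-bit lanes, so copying the raw bit pattern works equally for U32, S32 and F32 data. A small C illustration of why a bit-pattern copy is lossless:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        float    f = 3.14159f;
        uint32_t bits;
        float    copy;

        memcpy(&bits, &f, sizeof(bits)); /* move the value as an opaque 32-bit pattern */
        memcpy(&copy, &bits, sizeof(copy));

        printf("%f\n", copy); /* prints 3.141590 - the value survives unchanged */
        return 0;
    }
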
/** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_transpose1x8_f16(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
{
uint x = get_global_id(0);
uint y = get_global_id(1);
@@ -82,28 +86,28 @@
/* Compute address for Matrix B transposed - destination. X and Y are swapped */
uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
- half8 b0 = vload8(0, (__global half *)src.ptr);
+ ushort8 b0 = vload8(0, (__global ushort *)src.ptr);
- vstore8(b0, 0, (__global half *)(dst_ptr + dst_addr_in_bytes));
+ vstore8(b0, 0, (__global ushort *)(dst_ptr + dst_addr_in_bytes));
}
/** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U8
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_transpose1x16_u8(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_transpose1x16(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
{
uint x = get_global_id(0);
uint y = get_global_id(1);
@@ -127,7 +131,7 @@
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U32/S32/F32
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
@@ -142,33 +146,33 @@
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
/* Load values from Matrix A */
- float4 a0 = vload4(0, (__global float *)(offset(&src, 0, 0)));
- float4 a1 = vload4(0, (__global float *)(offset(&src, 0, 1)));
- float4 a2 = vload4(0, (__global float *)(offset(&src, 0, 2)));
- float4 a3 = vload4(0, (__global float *)(offset(&src, 0, 3)));
+ uint4 a0 = vload4(0, (__global uint *)(offset(&src, 0, 0)));
+ uint4 a1 = vload4(0, (__global uint *)(offset(&src, 0, 1)));
+ uint4 a2 = vload4(0, (__global uint *)(offset(&src, 0, 2)));
+ uint4 a3 = vload4(0, (__global uint *)(offset(&src, 0, 3)));
- float4 val0 = (float4)(a0.s0, a1.s0, a2.s0, a3.s0);
- vstore4(val0, 0, ((__global float *)dst.ptr) + 0);
+ uint4 val0 = (uint4)(a0.s0, a1.s0, a2.s0, a3.s0);
+ vstore4(val0, 0, ((__global uint *)dst.ptr) + 0);
- val0 = (float4)(a0.s1, a1.s1, a2.s1, a3.s1);
- vstore4(val0, 0, ((__global float *)dst.ptr) + 4);
+ val0 = (uint4)(a0.s1, a1.s1, a2.s1, a3.s1);
+ vstore4(val0, 0, ((__global uint *)dst.ptr) + 4);
- val0 = (float4)(a0.s2, a1.s2, a2.s2, a3.s2);
- vstore4(val0, 0, ((__global float *)dst.ptr) + 8);
+ val0 = (uint4)(a0.s2, a1.s2, a2.s2, a3.s2);
+ vstore4(val0, 0, ((__global uint *)dst.ptr) + 8);
- val0 = (float4)(a0.s3, a1.s3, a2.s3, a3.s3);
- vstore4(val0, 0, ((__global float *)dst.ptr) + 12);
+ val0 = (uint4)(a0.s3, a1.s3, a2.s3, a3.s3);
+ vstore4(val0, 0, ((__global uint *)dst.ptr) + 12);
}
/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/F16
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U16/S16/F16
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
@@ -183,33 +187,33 @@
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
/* Load values from Matrix A */
- half8 a0 = vload8(0, (__global half *)(offset(&src, 0, 0)));
- half8 a1 = vload8(0, (__global half *)(offset(&src, 0, 1)));
- half8 a2 = vload8(0, (__global half *)(offset(&src, 0, 2)));
- half8 a3 = vload8(0, (__global half *)(offset(&src, 0, 3)));
+ ushort8 a0 = vload8(0, (__global ushort *)(offset(&src, 0, 0)));
+ ushort8 a1 = vload8(0, (__global ushort *)(offset(&src, 0, 1)));
+ ushort8 a2 = vload8(0, (__global ushort *)(offset(&src, 0, 2)));
+ ushort8 a3 = vload8(0, (__global ushort *)(offset(&src, 0, 3)));
- half8 val0 = (half8)((half4)(a0.s0, a1.s0, a2.s0, a3.s0), (half4)(a0.s1, a1.s1, a2.s1, a3.s1));
- vstore8(val0, 0, ((__global half *)dst.ptr) + 0);
+ ushort8 val0 = (ushort8)((ushort4)(a0.s0, a1.s0, a2.s0, a3.s0), (ushort4)(a0.s1, a1.s1, a2.s1, a3.s1));
+ vstore8(val0, 0, ((__global ushort *)dst.ptr) + 0);
- val0 = (half8)((half4)(a0.s2, a1.s2, a2.s2, a3.s2), (half4)(a0.s3, a1.s3, a2.s3, a3.s3));
- vstore8(val0, 0, ((__global half *)dst.ptr) + 8);
+ val0 = (ushort8)((ushort4)(a0.s2, a1.s2, a2.s2, a3.s2), (ushort4)(a0.s3, a1.s3, a2.s3, a3.s3));
+ vstore8(val0, 0, ((__global ushort *)dst.ptr) + 8);
- val0 = (half8)((half4)(a0.s4, a1.s4, a2.s4, a3.s4), (half4)(a0.s5, a1.s5, a2.s5, a3.s5));
- vstore8(val0, 0, ((__global half *)dst.ptr) + 16);
+ val0 = (ushort8)((ushort4)(a0.s4, a1.s4, a2.s4, a3.s4), (ushort4)(a0.s5, a1.s5, a2.s5, a3.s5));
+ vstore8(val0, 0, ((__global ushort *)dst.ptr) + 16);
- val0 = (half8)((half4)(a0.s6, a1.s6, a2.s6, a3.s6), (half4)(a0.s7, a1.s7, a2.s7, a3.s7));
- vstore8(val0, 0, ((__global half *)dst.ptr) + 24);
+ val0 = (ushort8)((ushort4)(a0.s6, a1.s6, a2.s6, a3.s6), (ushort4)(a0.s7, a1.s7, a2.s7, a3.s7));
+ vstore8(val0, 0, ((__global ushort *)dst.ptr) + 24);
}
/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U8/S8
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
@@ -248,65 +252,47 @@
/** This kernel accumulates each row with the biases vector
*
- * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F32
+ * @note The data type must be passed at compile time using -DDATA_TYPE=type, e.g. -DDATA_TYPE=short
+ *
+ * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: U8/S8/QS8/U16/S16/F16/U32/S32/F32
* @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)
* @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] accum_stride_y Stride of the accumulate tensor in Y dimension (in bytes)
* @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
- * @param[in] biases_ptr Pointer to the biases vector. Same as input.
+ * @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr
* @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void gemm_accumulate_biases_f32(
+#ifdef DATA_TYPE
+__kernel void gemm_accumulate_biases(
IMAGE_DECLARATION(accum),
VECTOR_DECLARATION(biases))
{
Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
- float4 accum_value = vload4(0, (__global float *)accum.ptr);
- float4 biases_value = vload4(0, (__global float *)biases.ptr);
- accum_value = biases_value + accum_value;
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ accum_value = vload16(0, (__global DATA_TYPE *)accum.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ biases_value = vload16(0, (__global DATA_TYPE *)biases.ptr);
+#ifdef FIXED_POINT_POSITION
+ accum_value = ADD_SAT_OP_EXPAND(biases_value, accum_value, DATA_TYPE, 16);
+#else // FIXED_POINT_POSITION
+ accum_value = biases_value + accum_value;
+#endif // FIXED_POINT_POSITION
// Store result in the accumulate buffer
- vstore4(accum_value, 0, (__global float *)accum.ptr);
+ vstore16(accum_value, 0, (__global DATA_TYPE *)accum.ptr);
}
+#endif /* DATA_TYPE */
-/** This kernel accumulates each row with the biases vector
- *
- * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F16
- * @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)
- * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes)
- * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
- * @param[in] biases_ptr Pointer to the biases vector. Same as input.
- * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void gemm_accumulate_biases_f16(
- IMAGE_DECLARATION(accum),
- VECTOR_DECLARATION(biases))
-{
- Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
- Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
-
- half8 accum_value = vload8(0, (__global half *)accum.ptr);
- half8 biases_value = vload8(0, (__global half *)biases.ptr);
- accum_value = biases_value + accum_value;
-
- // Store result in the accummulate buffer
- vstore8(accum_value, 0, (__global half *)accum.ptr);
-}
-
-#if(defined WIDTH_MATRIX_B)
+#ifdef COLS_B
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_u8 and @ref gemm_transpose1x16_u8 before running the matrix multiplication
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B
+ * @attention The width of matrix B needs to be passed at compile time using -DCOLS_B
*
* @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -314,13 +300,13 @@
* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported formats: U8
+ * @param[in] src1_ptr Pointer to the source matrix. Supported formats: same as @p src0_ptr
* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported formats: U8
+ * @param[out] dst_ptr Pointer to the destination matrix Supported formats: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
@@ -332,14 +318,14 @@
* @param[in] c_mult_int Multiplied with each element of the matrix C.
* @param[in] shift Number of bits to shift right the result.
*/
-__kernel void gemm_mm_u8(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst),
- int a_offset,
- int b_offset,
- int c_offset,
- int c_mult_int,
- int shift)
+__kernel void gemm_mm_interleaved_transposed_u8(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst),
+ int a_offset,
+ int b_offset,
+ int c_offset,
+ int c_mult_int,
+ int shift)
{
/* src_addr.s0 = address of matrix A */
/* src_addr.s1 = address of matrix B */
@@ -352,7 +338,7 @@
src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
/* Compute end row address for matrix B */
- int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+ int end_row_mtx_b = src_addr.s1 + COLS_B;
/* Reset accumulators */
int16 c00 = 0.0f;
@@ -406,13 +392,13 @@
vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
}
-#endif
+#endif /* COLS_B */
-#if(defined WIDTH_MATRIX_B && defined ALPHA)
+#if defined(COLS_B) && defined(ALPHA)
/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -420,22 +406,22 @@
* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_mm_f32_midgard(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
{
/* src_addr.s0 = address of matrix A */
/* src_addr.s1 = address of matrix B */
@@ -451,7 +437,7 @@
src_addr = src_addr >> 2;
/* Compute end row address for matrix B */
- int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+ int end_row_mtx_b = src_addr.s1 + COLS_B;
/* Reset accumulators */
float4 c00 = 0.0f;
@@ -509,9 +495,9 @@
}
/** This OpenCL kernel is optimised for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -519,22 +505,22 @@
* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_mm_f32_bifrost(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
{
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
@@ -542,7 +528,7 @@
__global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
// Compute end row address for matrix B
- __global float *src_end_addr_b = src_addr_b + WIDTH_MATRIX_B;
+ __global float *src_end_addr_b = src_addr_b + COLS_B;
// Reset accumulators
float c00 = 0.0f;
@@ -719,9 +705,9 @@
}
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f16 and @ref gemm_transpose1x8_f16 before running the matrix multiplication
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -729,22 +715,22 @@
* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_mm_f16(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
{
/* src_addr.s0 = address of matrix A */
/* src_addr.s1 = address of matrix B */
@@ -760,7 +746,7 @@
src_addr = src_addr >> 1;
/* Compute end row address for matrix B */
- int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+ int end_row_mtx_b = src_addr.s1 + COLS_B;
/* Reset accumulators */
half8 c00 = 0.0f;
@@ -768,7 +754,7 @@
half8 c20 = 0.0f;
half8 c30 = 0.0f;
- for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 16))
+ for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(8, 16))
{
/* Load values from matrix A (interleaved) and matrix B (transposed) */
half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
@@ -817,146 +803,634 @@
vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
}
-#if(defined WIDTH_VECTOR_A)
-/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
+#ifdef FIXED_POINT_POSITION
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 8 bit fixed point precision
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
*
- * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
+ * @attention The width of matrix B, the alpha's value and fixed point position need to be passed at compile time using -DCOLS_B -DALPHA and -DFIXED_POINT_POSITION
*
- * @attention The input vector A and matrix B must not be reshaped
+ * @note ALPHA must be passed in 8 bit fixed point format
*
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
{
- int idx = get_global_id(0) * 4;
+ /* src_addr.s0 = address of matrix A */
+ /* src_addr.s1 = address of matrix B */
- /* Compute the address for the vector A and matrix B */
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
- src_addr.s1 += idx * sizeof(float);
+ /* Compute address for matrix A and B */
+ int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+ (src1_stride_y));
- int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
+ /* Add offset_first_element_in_bytes */
+ src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
- float4 acc = 0.0f;
+ /* Compute end row address for matrix B */
+ int end_row_mtx_b = src_addr.s1 + COLS_B;
- for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+ /* Reset accumulators */
+ short8 c00 = 0.0f;
+ short8 c10 = 0.0f;
+ short8 c20 = 0.0f;
+ short8 c30 = 0.0f;
+ short8 c01 = 0.0f;
+ short8 c11 = 0.0f;
+ short8 c21 = 0.0f;
+ short8 c31 = 0.0f;
+
+ /* This for loop performs 1 accumulation for each iteration */
+ for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(4, 16))
{
- float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
- float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
- float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ char4 a0 = vload4(0, ((__global char *)src0_ptr) + src_addr.s0);
+ char16 b0 = vload16(0, ((__global char *)src1_ptr) + src_addr.s1);
- acc += b0 * (float4)a0.s0;
- acc += b1 * (float4)a0.s1;
- }
+ c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
+ c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION);
+ c20 = mlal_sat_qs8x8(c20, (char8)a0.s2, b0.s01234567, FIXED_POINT_POSITION);
+ c30 = mlal_sat_qs8x8(c30, (char8)a0.s3, b0.s01234567, FIXED_POINT_POSITION);
- for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
- {
- float a0 = *((__global float *)(src0_ptr + src_addr.s0));
- float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
-
- acc += b0 * (float4)a0;
+ c01 = mlal_sat_qs8x8(c01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
+ c11 = mlal_sat_qs8x8(c11, (char8)a0.s1, b0.s89ABCDEF, FIXED_POINT_POSITION);
+ c21 = mlal_sat_qs8x8(c21, (char8)a0.s2, b0.s89ABCDEF, FIXED_POINT_POSITION);
+ c31 = mlal_sat_qs8x8(c31, (char8)a0.s3, b0.s89ABCDEF, FIXED_POINT_POSITION);
}
/* Compute destination address */
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
- /* Multiply by the weight of vector-matrix product */
- acc = acc * (float4)ALPHA;
+ /* Multiply by the weight of matrix product */
+ char16 c00_qs8 = convert_char16_sat((short16)(c00, c01));
+ char16 c10_qs8 = convert_char16_sat((short16)(c10, c11));
+ char16 c20_qs8 = convert_char16_sat((short16)(c20, c21));
+ char16 c30_qs8 = convert_char16_sat((short16)(c30, c31));
- vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
+ c00_qs8 = mul_sat_qs8x16(c00_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+ c10_qs8 = mul_sat_qs8x16(c10_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+ c20_qs8 = mul_sat_qs8x16(c20_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+ c30_qs8 = mul_sat_qs8x16(c30_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+
+ /* Store 16x4 block */
+ vstore16(c00_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
+ vstore16(c10_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
+ vstore16(c20_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
+ vstore16(c30_qs8, 0, (__global char *)(offset(&dst, 0, 3)));
}
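Note: for illustration only, a hypothetical build of gemm_mm_interleaved_transposed_qs8 (the values below are examples, not taken from the library):

/* Example build options (hypothetical values): reshaped matrix B is 128
 * elements wide, alpha = 1.0 encoded with 3 fractional bits (1.0 * 2^3 = 8):
 *
 *   -DCOLS_B=128 -DALPHA=8 -DFIXED_POINT_POSITION=3
 *
 * With these settings each work item writes a 16x4 block of the QS8 output
 * (see the vstore16 calls above).
 */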
-/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 16 bit fixed point precision
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
*
- * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
+ * @attention The width of matrix B, the alpha's value and fixed point position need to be passed at compile time using -DCOLS_B -DALPHA and -DFIXED_POINT_POSITION
*
- * @attention The input vector A and matrix B must not be reshaped
+ * @note: ALPHA must be passed in 16 bit fixed point format
*
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_vm_f16(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
{
- int idx = get_global_id(0) * 8;
+ /* src_addr.s0 = address of matrix A */
+ /* src_addr.s1 = address of matrix B */
- /* Compute the address for the vector A and matrix B */
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
- src_addr.s1 += idx * sizeof(half);
+ /* Compute address for matrix A and B */
+ int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+ (src1_stride_y));
- int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(half));
+ /* Add offset_first_element_in_bytes */
+ src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
- half8 acc = 0.0f;
+ /* Divide by 2 in order to get the src_addr in units of short */
+ src_addr = src_addr >> 1;
- for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(half)); src_addr += (int2)(4 * sizeof(half), 4 * src1_stride_y))
+ /* Compute end row address for matrix B */
+ int end_row_mtx_b = src_addr.s1 + COLS_B;
+
+ /* Reset accumulators */
+ int8 c00 = 0;
+ int8 c10 = 0;
+ int8 c20 = 0;
+ int8 c30 = 0;
+
+ /* This for loop performs 1 accumulation for each iteration */
+ for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(4, 8))
{
- half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0));
- half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
- half8 b1 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
- half8 b2 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
- half8 b3 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ short4 a0 = vload4(0, ((__global short *)src0_ptr) + src_addr.s0);
+ short8 b0 = vload8(0, ((__global short *)src1_ptr) + src_addr.s1);
- acc += b0 * (half8)a0.s0;
- acc += b1 * (half8)a0.s1;
- acc += b2 * (half8)a0.s2;
- acc += b3 * (half8)a0.s3;
- }
-
- for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(half), src1_stride_y))
- {
- half a0 = *((__global half *)(src0_ptr + src_addr.s0));
- half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
-
- acc += b0 * (half8)a0;
+ c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION);
+ c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION);
+ c20 = mlal_sat_qs16x8(c20, (short8)a0.s2, b0, FIXED_POINT_POSITION);
+ c30 = mlal_sat_qs16x8(c30, (short8)a0.s3, b0, FIXED_POINT_POSITION);
}
/* Compute destination address */
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
- /* Multiply by the weight of vector-matrix product */
- acc = acc * (half8)ALPHA;
+ /* Multiply by the weight of matrix product */
+ short8 c00_qs16 = convert_short8_sat(c00);
+ short8 c10_qs16 = convert_short8_sat(c10);
+ short8 c20_qs16 = convert_short8_sat(c20);
+ short8 c30_qs16 = convert_short8_sat(c30);
- vstore8(acc, 0, (__global half *)(offset(&dst, 0, 0)));
+ c00_qs16 = mul_sat_qs16x8(c00_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+ c10_qs16 = mul_sat_qs16x8(c10_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+ c20_qs16 = mul_sat_qs16x8(c20_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+ c30_qs16 = mul_sat_qs16x8(c30_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+
+ /* Store 8x4 block */
+ vstore8(c00_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
+ vstore8(c10_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
+ vstore8(c20_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
+ vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
}
-#endif /* (defined WIDTH_VECTOR_A) */
-#endif /* (defined WIDTH_MATRIX_B && defined ALPHA) */
+#endif // defined(FIXED_POINT_POSITION)
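Note: the saturating fixed point helpers used by these kernels (mlal_sat_qs8x8, mul_sat_qs8x16, mlal_sat_qs16x8, ...) come from the library's fixed point headers; as a rough scalar sketch of the intended arithmetic (ignoring rounding, names illustrative only):

/* Scalar sketch of a saturating QS8 multiply-accumulate into a 16 bit
 * accumulator, assuming FIXED_POINT_POSITION fractional bits. */
short mlal_sat_qs8_scalar(short acc, char a, char b, int fixed_point_position)
{
    int prod = ((int)a * (int)b) >> fixed_point_position; /* rescale the product back to the Q format */
    int sum  = (int)acc + prod;                           /* accumulate in a wider type */
    if(sum > 32767)
    {
        sum = 32767;                                      /* saturate to the 16 bit range */
    }
    if(sum < -32768)
    {
        sum = -32768;
    }
    return (short)sum;
}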
-#if(defined BETA)
+#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+#if defined(DATA_TYPE)
+#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with floating point data types (F16/F32)
+ * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
+ * @note The width of matrix A and the alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(DATA_TYPE);
+
+ int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));
+
+ VECTOR_TYPE acc0 = 0.0f;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VECTOR_TYPE acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VECTOR_TYPE acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VECTOR_TYPE acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
+ {
+ // Load values from matrix A
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ // Load values from matrix B
+ VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
+ VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ // Accumulate
+ acc0 += b0 * (VECTOR_TYPE)a0.s0;
+ acc0 += b1 * (VECTOR_TYPE)a0.s1;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * (VECTOR_TYPE)a1.s0;
+ acc1 += b1 * (VECTOR_TYPE)a1.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * (VECTOR_TYPE)a2.s0;
+ acc2 += b1 * (VECTOR_TYPE)a2.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * (VECTOR_TYPE)a3.s0;
+ acc3 += b1 * (VECTOR_TYPE)a3.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))
+ {
+ // Load values from matrix A
+ DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ // Load values from matrix B
+ VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
+
+ // Accumulate
+ acc0 += b0 * (VECTOR_TYPE)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * (VECTOR_TYPE)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * (VECTOR_TYPE)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * (VECTOR_TYPE)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Multiply by the weight of matrix-matrix product and store the result
+ acc0 = acc0 * (VECTOR_TYPE)ALPHA;
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (acc0, 0, (__global DATA_TYPE *)(offset(&dst, 0, 0)));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = acc1 * (VECTOR_TYPE)ALPHA;
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (acc1, 0, (__global DATA_TYPE *)(offset(&dst, 0, 1)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = acc2 * (VECTOR_TYPE)ALPHA;
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (acc2, 0, (__global DATA_TYPE *)(offset(&dst, 0, 2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = acc3 * (VECTOR_TYPE)ALPHA;
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (acc3, 0, (__global DATA_TYPE *)(offset(&dst, 0, 3)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+#endif // defined(DATA_TYPE)
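Note: a hypothetical host-side configuration for gemm_mm_floating_point, to show how the defines above fit together (the values are examples only; in practice the build options are assembled by the library's kernel classes at run time):

/* Example build options (hypothetical values). With NUM_ELEMS_PROCESSED_PER_THREAD_X=4
 * and NUM_ELEMS_PROCESSED_PER_THREAD_Y=4 each work item produces a 4x4 block of the
 * output, so for an MxK by KxN product the 2D global work size is roughly
 * (ceil(N / 4), ceil(M / 4)). */
const char *gemm_mm_fp_build_opts = "-DDATA_TYPE=float "
                                    "-DCOLS_A=96 "
                                    "-DALPHA=1.0f "
                                    "-DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 "
                                    "-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4";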
+
+#ifdef FIXED_POINT_POSITION
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the QS8 fixed point data type
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
+ * @note The width of matrix A and the alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
+ * @note The fixed point position needs to be passed at compile time using -DFIXED_POINT_POSITION
+ * @note The alpha value must be passed in 8 bit fixed point format using -DALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(char);
+
+ int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(char));
+
+ short8 acc00 = 0;
+ short8 acc01 = 0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ short8 acc10 = 0;
+ short8 acc11 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ short8 acc20 = 0;
+ short8 acc21 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ short8 acc30 = 0;
+ short8 acc31 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // This for loop performs 4 accumulations per iteration
+ for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
+ {
+ char2 a0 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ char2 a1 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ char2 a2 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ char2 a3 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
+ char16 b1 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
+
+ acc00 = mlal_sat_qs8x8(acc00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
+ acc00 = mlal_sat_qs8x8(acc00, (char8)a0.s1, b1.s01234567, FIXED_POINT_POSITION);
+ acc01 = mlal_sat_qs8x8(acc01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
+ acc01 = mlal_sat_qs8x8(acc01, (char8)a0.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc10 = mlal_sat_qs8x8(acc10, (char8)a1.s0, b0.s01234567, FIXED_POINT_POSITION);
+ acc10 = mlal_sat_qs8x8(acc10, (char8)a1.s1, b1.s01234567, FIXED_POINT_POSITION);
+ acc11 = mlal_sat_qs8x8(acc11, (char8)a1.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
+ acc11 = mlal_sat_qs8x8(acc11, (char8)a1.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc20 = mlal_sat_qs8x8(acc20, (char8)a2.s0, b0.s01234567, FIXED_POINT_POSITION);
+ acc20 = mlal_sat_qs8x8(acc20, (char8)a2.s1, b1.s01234567, FIXED_POINT_POSITION);
+ acc21 = mlal_sat_qs8x8(acc21, (char8)a2.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
+ acc21 = mlal_sat_qs8x8(acc21, (char8)a2.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc30 = mlal_sat_qs8x8(acc30, (char8)a3.s0, b0.s01234567, FIXED_POINT_POSITION);
+ acc30 = mlal_sat_qs8x8(acc30, (char8)a3.s1, b1.s01234567, FIXED_POINT_POSITION);
+ acc31 = mlal_sat_qs8x8(acc31, (char8)a3.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
+ acc31 = mlal_sat_qs8x8(acc31, (char8)a3.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ // Left-over accumulations
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
+ {
+ char a0 = *((__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ char a1 = *((__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ char a2 = *((__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ char a3 = *((__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1));
+
+ acc00 = mlal_sat_qs8x8(acc00, (char8)a0, b0.s01234567, FIXED_POINT_POSITION);
+ acc01 = mlal_sat_qs8x8(acc01, (char8)a0, b0.s89ABCDEF, FIXED_POINT_POSITION);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc10 = mlal_sat_qs8x8(acc10, (char8)a1, b0.s01234567, FIXED_POINT_POSITION);
+ acc11 = mlal_sat_qs8x8(acc11, (char8)a1, b0.s89ABCDEF, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc20 = mlal_sat_qs8x8(acc20, (char8)a2, b0.s01234567, FIXED_POINT_POSITION);
+ acc21 = mlal_sat_qs8x8(acc21, (char8)a2, b0.s89ABCDEF, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc30 = mlal_sat_qs8x8(acc30, (char8)a3, b0.s01234567, FIXED_POINT_POSITION);
+ acc31 = mlal_sat_qs8x8(acc31, (char8)a3, b0.s89ABCDEF, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Multiply by the weight of matrix product and store the result
+ char16 acc_qs8;
+ acc_qs8 = convert_char16_sat((short16)(acc00, acc01));
+ acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+ vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc_qs8 = convert_char16_sat((short16)(acc10, acc11));
+ acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+ vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc_qs8 = convert_char16_sat((short16)(acc20, acc21));
+ acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+ vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc_qs8 = convert_char16_sat((short16)(acc30, acc31));
+ acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+ vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 3)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+
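Note: for the fixed point kernels ALPHA (and BETA further below) must already be expressed in the Q format, i.e. the real value scaled by 2^FIXED_POINT_POSITION; for example, with FIXED_POINT_POSITION=3 the real value 1.5 becomes the QS8 integer 1.5 * 2^3 = 12, so it would be passed as -DALPHA=12. A small conversion sketch (illustrative only, assuming round half away from zero):

/* Convert a real-valued scalar to its QS8 representation (illustrative). */
char float_to_qs8(float x, int fixed_point_position)
{
    float scaled = x * (float)(1 << fixed_point_position);
    if(scaled > 127.0f)
    {
        scaled = 127.0f;
    }
    if(scaled < -128.0f)
    {
        scaled = -128.0f;
    }
    return (char)(scaled + ((scaled >= 0.0f) ? 0.5f : -0.5f));
}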
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the QS16 fixed point data type
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
+ * @note The width of matrix A and the alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
+ * @note The fixed point position needs to be passed at compile time using -DFIXED_POINT_POSITION
+ * @note The alpha value must be passed in 16 bit fixed point format using -DALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(short);
+
+ int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(short));
+
+ int8 acc0 = 0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ int8 acc1 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ int8 acc2 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ int8 acc3 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // This for loop performs 4 accumulations per iteration
+ for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(short)); src_addr += (int2)(2 * sizeof(short), 2 * src1_stride_y))
+ {
+ short2 a0 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ short2 a1 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ short2 a2 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ short2 a3 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
+ short8 b1 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
+
+ acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s0, b0, FIXED_POINT_POSITION);
+ acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s1, b1, FIXED_POINT_POSITION);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s0, b0, FIXED_POINT_POSITION);
+ acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s1, b1, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s0, b0, FIXED_POINT_POSITION);
+ acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s1, b1, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s0, b0, FIXED_POINT_POSITION);
+ acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s1, b1, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ // Left-over accumulations
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(short), src1_stride_y))
+ {
+ short a0 = *((__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ short a1 = *((__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ short a2 = *((__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ short a3 = *((__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1));
+
+ acc0 = mlal_sat_qs16x8(acc0, (short8)a0, b0, FIXED_POINT_POSITION);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = mlal_sat_qs16x8(acc1, (short8)a1, b0, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = mlal_sat_qs16x8(acc2, (short8)a2, b0, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = mlal_sat_qs16x8(acc3, (short8)a3, b0, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Multiply by the weight of matrix product and store the result
+ short8 acc_qs16;
+ acc_qs16 = convert_short8_sat(acc0);
+ acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+ vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc_qs16 = convert_short8_sat(acc1);
+ acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+ vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc_qs16 = convert_short8_sat(acc2);
+ acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+ vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc_qs16 = convert_short8_sat(acc3);
+ acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+ vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+#endif // defined(FIXED_POINT_POSITION)
+#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+#endif // defined(COLS_B) && defined(ALPHA)
+
+#ifdef BETA
/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
*
* @attention The beta's value needs to be passed at compile time using -DBETA
@@ -967,7 +1441,7 @@
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
@@ -996,13 +1470,15 @@
/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
*
+ * @attention The beta's value needs to be passed at compile time using -DBETA
+ *
* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
@@ -1028,9 +1504,89 @@
/* Store final result in axb matrix */
vstore8(out, 0, (__global half *)dst.ptr);
}
-#endif /* (defined BETA) */
-#if(defined WIDTH_VECTOR_A)
+#ifdef FIXED_POINT_POSITION
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 8 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
+ *
+ * @note: BETA must be passed in 8 bit fixed point format
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: QS8
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_qs8(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Compute source and destination addresses */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load values from A x B */
+ char16 alpha_ab = vload16(0, (__global char *)dst.ptr);
+
+ /* Load values from Matrix C */
+ char16 c = vload16(0, (__global char *)src.ptr);
+
+ /* Computes alpha * axb + beta * c */
+ char16 out = mla_sat_qs8x16(alpha_ab, (char16)BETA, c, FIXED_POINT_POSITION);
+
+ /* Store final result in axb matrix */
+ vstore16(out, 0, (__global char *)dst.ptr);
+}
+
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 16 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
+ *
+ * @note: BETA must be passed in 16 bit fixed point format
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: QS16
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_qs16(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Compute source and destination addresses */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load values from A x B */
+ short8 alpha_ab = vload8(0, (__global short *)dst.ptr);
+
+ /* Load values from Matrix C */
+ short8 c = vload8(0, (__global short *)src.ptr);
+
+ /* Computes alpha * axb + beta * c */
+ short8 out = mla_sat_qs16x8(alpha_ab, (short8)BETA, c, FIXED_POINT_POSITION);
+
+ /* Store final result in axb matrix */
+ vstore8(out, 0, (__global short *)dst.ptr);
+}
+#endif /* defined(FIXED_POINT_POSITION) */
+#endif /* defined(BETA) */
+
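Note: taken together, the gemm_mm_* and gemm_ma_* kernels split D = alpha * A * B + beta * C into two passes over the destination; a scalar reference of that split (illustrative only):

/* Reference semantics of the two-pass GEMM: gemm_mm_* writes alpha * A * B
 * into dst, then gemm_ma_* adds the beta-weighted C term in place. */
void gemm_reference(const float *a, const float *b, const float *c, float *dst,
                    int m, int n, int k, float alpha, float beta)
{
    for(int i = 0; i < m; ++i)
    {
        for(int j = 0; j < n; ++j)
        {
            float acc = 0.0f;
            for(int l = 0; l < k; ++l)
            {
                acc += a[i * k + l] * b[l * n + j];
            }
            dst[i * n + j] = alpha * acc;              /* gemm_mm_* pass */
            dst[i * n + j] += beta * c[i * n + j];     /* gemm_ma_* pass (in place) */
        }
    }
}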
+#ifdef WIDTH_VECTOR_A
/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
*
* @attention The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A
@@ -1043,7 +1599,7 @@
* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
@@ -1051,7 +1607,7 @@
* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
@@ -1096,4 +1652,4 @@
vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
}
-#endif /* (defined WIDTH_VECTOR_A) */
+#endif /* WIDTH_VECTOR_A */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/gemv.cl
new file mode 100644
index 0000000..76128f7
--- /dev/null
+++ b/src/core/CL/cl_kernels/gemv.cl
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This kernel applies the dot product to each plane of the input tensor and the corresponding column of the reshaped weight tensor.
+ *
+ * @note The data type and the source width and height must be given as preprocessor arguments using -DDATA_TYPE=type, -DSRC_WIDTH=width and -DSRC_HEIGHT=height, e.g. -DDATA_TYPE=short
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VECTOR_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ int y = get_global_id(1) * 4;
+ int z = get_global_id(2);
+
+ __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
+ __global uchar *input_ptr = src.ptr;
+
+ DATA_TYPE acc0 = (DATA_TYPE)0;
+ DATA_TYPE acc1 = (DATA_TYPE)0;
+ DATA_TYPE acc2 = (DATA_TYPE)0;
+ DATA_TYPE acc3 = (DATA_TYPE)0;
+
+ // This kernel handles 4 rows per thread so that it can reuse the weights
+ for(int i = 0; i < SRC_WIDTH; i += 4)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ weights = vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x));
+
+ int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp0 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp1 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp2 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp3 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3));
+
+ acc0 += dot(weights, tmp0);
+ acc1 += dot(weights, tmp1);
+ acc2 += dot(weights, tmp2);
+ acc3 += dot(weights, tmp3);
+ }
+
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
+
+ int rows_left = SRC_HEIGHT - (y + 4);
+
+ // This check handles the last few rows when SRC_HEIGHT is not divisible by four
+ if(rows_left >= 0)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out = (VEC_DATA_TYPE(DATA_TYPE, 4))(acc0, acc1, acc2, acc3);
+ vstore4(out, 0, (__global DATA_TYPE *)output_ptr);
+ }
+ else
+ {
+ switch(rows_left)
+ {
+ case -1: // three rows left; one is padding
+ *((__global DATA_TYPE *)(output_ptr + 2 * dst_stride_x)) = acc2;
+ case -2: // two rows left; two are padding
+ *((__global DATA_TYPE *)(output_ptr + 1 * dst_stride_x)) = acc1;
+ case -3: // one row left; three are padding
+ *((__global DATA_TYPE *)(output_ptr + 0 * dst_stride_x)) = acc0;
+ break;
+ }
+ }
+}
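Note: a hypothetical configuration for gemm_mv (example values only). Each work item accumulates four rows of one input plane against one row of the reshaped weights, so global dimension 1 spans roughly ceil(SRC_HEIGHT / 4) work items and global dimension 2 one work item per plane.

/* Example build options (hypothetical values):
 *
 *   -DDATA_TYPE=float -DSRC_WIDTH=64 -DSRC_HEIGHT=64
 */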
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 6db8ed5..68af64e 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -26,8 +26,16 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#define EXPAND(x) x
+
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
@@ -64,6 +72,18 @@
uint name##_step_z, \
uint name##_offset_first_element_in_bytes
+#define TENSOR4D_DECLARATION(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_x, \
+ uint name##_step_x, \
+ uint name##_stride_y, \
+ uint name##_step_y, \
+ uint name##_stride_z, \
+ uint name##_step_z, \
+ uint name##_stride_w, \
+ uint name##_step_w, \
+ uint name##_offset_first_element_in_bytes
+
#define CONVERT_TO_VECTOR_STRUCT(name) \
update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
@@ -76,6 +96,15 @@
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
#define CONVERT_TO_TENSOR3D_STRUCT(name) \
update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
name##_stride_z, name##_step_z)
@@ -83,6 +112,13 @@
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+ name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
/** Structure to hold Vector information */
typedef struct Vector
{
@@ -110,6 +146,17 @@
int stride_z; /**< Stride of the image in Z dimension (in bytes) */
} Tensor3D;
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+ __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the image in Z dimension (in bytes) */
+ int stride_w; /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
/** Wrap vector information into a Vector structure, and make the pointer point at this workitem's data.
*
* @param[in] ptr Pointer to the starting position of the buffer
@@ -155,6 +202,32 @@
return img;
}
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+ Image img =
+ {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y
+ };
+ img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return img;
+}
+
/** Wrap 3D tensor information into a tensor structure, and make the pointer point at this workitem's data.
*
* @param[in] ptr Pointer to the starting position of the buffer
@@ -182,6 +255,24 @@
return tensor;
}
+Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+ uint step_w,
+ uint mod_size)
+{
+ Tensor4D tensor =
+ {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w
+ };
+
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+ return tensor;
+}
+
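Note: the new 4D helpers fold the Z and W dimensions onto the third global dimension; mod_size is the number of Z slices that share one W index, so get_global_id(2) is split back as z = gid % mod_size and w = gid / mod_size. A minimal usage sketch (hypothetical kernel name and argument):

/* Hypothetical kernel walking a 4D tensor, where "depth" plays the role of mod_size. */
__kernel void example_tensor4d_walk(TENSOR4D_DECLARATION(in), uint depth)
{
    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(in, depth);

    /* in.ptr now points at this workitem's element, with
     * z = get_global_id(2) % depth and w = get_global_id(2) / depth. */
    *in.ptr = 0;
}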
/** Get the pointer position of a Vector
*
* @param[in] vec Pointer to the starting position of the buffer
@@ -205,7 +296,7 @@
/** Get the pointer position of a Tensor3D
*
- * @param[in] tensor Pointer to the starting postion of the buffer
+ * @param[in] tensor Pointer to the starting position of the buffer
* @param[in] x Relative X position
* @param[in] y Relative Y position
* @param[in] z Relative Z position
@@ -215,4 +306,17 @@
return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ * @param[in] w Relative W position
+ */
+__global inline const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
#endif // _HELPER_H
diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
index 31dd57b..5d3a607 100644
--- a/src/core/CL/cl_kernels/hog.cl
+++ b/src/core/CL/cl_kernels/hog.cl
@@ -24,7 +24,7 @@
#include "helpers.h"
#include "types.h"
-#if(defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE)
+#if defined(CELL_WIDTH) && defined(CELL_HEIGHT) && defined(NUM_BINS) && defined(PHASE_SCALE)
/** This OpenCL kernel computes the HOG orientation binning
*
@@ -159,21 +159,21 @@
((__global float *)dst.ptr)[xc] = bins[xc];
}
}
-#endif // (defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE)
+#endif /* CELL_WIDTH and CELL_HEIGHT and NUM_BINS and PHASE_SCALE */
-#if(defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && HOG_NORM_TYPE && defined L2_HYST_THRESHOLD)
+#if defined(NUM_CELLS_PER_BLOCK_HEIGHT) && defined(NUM_BINS_PER_BLOCK_X) && defined(NUM_BINS_PER_BLOCK) && defined(HOG_NORM_TYPE) && defined(L2_HYST_THRESHOLD)
#ifndef L2_NORM
#error The value of enum class HOGNormType::L2_NORM has not been passed to the OpenCL kernel
-#endif
+#endif /* not L2_NORM */
#ifndef L2HYS_NORM
#error The value of enum class HOGNormType::L2HYS_NORM has not been passed to the OpenCL kernel
-#endif
+#endif /* not L2HYS_NORM */
#ifndef L1_NORM
#error The value of enum class HOGNormType::L1_NORM has not been passed to the OpenCL kernel
-#endif
+#endif /* not L1_NORM */
/** This OpenCL kernel computes the HOG block normalization
*
@@ -231,13 +231,13 @@
sum_f32 += val1 * val1;
sum_f32 += val2 * val2;
sum_f32 += val3 * val3;
-#else
+#else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
// Compute |val| for L1_NORM
sum_f32 += fabs(val0);
sum_f32 += fabs(val1);
sum_f32 += fabs(val2);
sum_f32 += fabs(val3);
-#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
// Store linearly the input values un-normalized in the output image. These values will be reused for the normalization.
// This approach will help us to be cache friendly in the next for loop where the normalization will be done because all the values
@@ -255,9 +255,9 @@
#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
sum += val * val;
-#else
+#else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
sum += fabs(val);
-#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
}
@@ -322,7 +322,7 @@
// We use the same constants of OpenCV
scale = 1.0f / (sqrt(sum) + 1e-3f);
-#endif // (HOG_NORM_TYPE == L2HYS_NORM)
+#endif /* (HOG_NORM_TYPE == L2HYS_NORM) */
int i = 0;
for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
@@ -349,9 +349,9 @@
((__global float *)dst.ptr)[i] *= scale;
}
}
-#endif // (defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && HOG_NORM_TYPE && defined L2_HYST_THRESHOLD)
+#endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */
-#if(defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && defined THRESHOLD && defined MAX_NUM_DETECTION_WINDOWS && defined IDX_CLASS && defined BLOCK_STRIDE_WIDTH && defined BLOCK_STRIDE_HEIGHT && defined DETECTION_WINDOW_WIDTH && defined DETECTION_WINDOW_HEIGHT)
+#if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(BLOCK_STRIDE_WIDTH) && defined(BLOCK_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)
/** This OpenCL kernel computes the HOG detector using linear SVM
*
@@ -452,4 +452,5 @@
}
}
}
-#endif // defined BIAS && defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && ...
+#endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS &&
+ * BLOCK_STRIDE_WIDTH && BLOCK_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */
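The normalization branches above reduce to the standard HOG block norms. A scalar sketch of the L2 and L2-Hys paths follows; the clip-then-renormalize step is the usual L2-Hys definition, and only the 1e-3f stabilizer of the second pass is visible in the hunk above, so the first-pass constant is an assumption.

#include <math.h>
#include <stddef.h>

/* Scalar sketch of the L2 / L2-Hys block normalization path.
 * For L2-Hys the block is clipped at l2_hyst_threshold and renormalized. */
static void hog_block_normalize_l2(float *bins, size_t n, int use_hys, float l2_hyst_threshold)
{
    float sum = 0.0f;
    for(size_t i = 0; i < n; ++i)
    {
        sum += bins[i] * bins[i];
    }
    /* First-pass stabilizer is an assumption; the kernel's exact constant is not shown in this hunk. */
    float scale = 1.0f / (sqrtf(sum) + 1e-3f);
    for(size_t i = 0; i < n; ++i)
    {
        bins[i] *= scale;
    }
    if(use_hys)
    {
        sum = 0.0f;
        for(size_t i = 0; i < n; ++i)
        {
            bins[i] = fminf(bins[i], l2_hyst_threshold);
            sum += bins[i] * bins[i];
        }
        scale = 1.0f / (sqrtf(sum) + 1e-3f); /* same constant as OpenCV, as in the kernel */
        for(size_t i = 0; i < n; ++i)
        {
            bins[i] *= scale;
        }
    }
}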
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/l2_normalize.cl
new file mode 100644
index 0000000..8d47631
--- /dev/null
+++ b/src/core/CL/cl_kernels/l2_normalize.cl
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This kernel performs L2 normalization, scaling the input by 1 / sqrt(max(sum, epsilon)), where sum holds a precomputed sum of squares.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_ptr Pointer to the sum tensor (precomputed sum of squares). Supported data types: QS8/F16/F32
+ * @param[in] sum_stride_x Stride of the sum tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] epsilon Epsilon value
+ */
+__kernel void l2_normalize(
+ VECTOR_DECLARATION(src),
+ VECTOR_DECLARATION(sum),
+ VECTOR_DECLARATION(dst),
+ DATA_TYPE epsilon)
+{
+ Vector src = CONVERT_TO_VECTOR_STRUCT(src);
+ Vector sum = CONVERT_TO_VECTOR_STRUCT(sum);
+ Vector dst = CONVERT_TO_VECTOR_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, (__global DATA_TYPE *)src.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))native_rsqrt(fmax(((__global DATA_TYPE *)sum.ptr)[0], epsilon));
+
+ vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
+}
\ No newline at end of file
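The element-wise operation of the new kernel is out[i] = in[i] * rsqrt(max(sum, epsilon)), with sum expected to hold the precomputed sum of squares along the normalized axis. A scalar C reference under that assumption:

#include <math.h>
#include <stddef.h>

/* Scalar reference for l2_normalize: scale every input element by
 * 1 / sqrt(max(sum_of_squares, epsilon)). */
static void l2_normalize_ref(const float *in, float sum_of_squares, float epsilon, float *out, size_t n)
{
    const float inv_norm = 1.0f / sqrtf(fmaxf(sum_of_squares, epsilon));
    for(size_t i = 0; i < n; ++i)
    {
        out[i] = in[i] * inv_norm;
    }
}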
diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl
index c4b0df8..e9845e0 100644
--- a/src/core/CL/cl_kernels/magnitude_phase.cl
+++ b/src/core/CL/cl_kernels/magnitude_phase.cl
@@ -81,17 +81,17 @@
#define MAGNITUDE_OP(x, y) magnitude_l1((x), (y))
#elif(2 == MAGNITUDE)
#define MAGNITUDE_OP(x, y) magnitude_l2(convert_int16(x), convert_int16(y))
-#else
+#else /* MAGNITUDE */
#define MAGNITUDE_OP(x, y)
-#endif
+#endif /* MAGNITUDE */
#if(1 == PHASE)
#define PHASE_OP(x, y) phase_unsigned((x), (y))
#elif(2 == PHASE)
#define PHASE_OP(x, y) phase_signed((x), (y))
-#else
+#else /* PHASE */
#define PHASE_OP(x, y)
-#endif
+#endif /* PHASE */
/** Calculate the magnitude and phase given the gradients of an image.
*
@@ -133,11 +133,11 @@
#ifdef MAGNITUDE
,
IMAGE_DECLARATION(magnitude)
-#endif
+#endif /* MAGNITUDE */
#ifdef PHASE
,
IMAGE_DECLARATION(phase)
-#endif
+#endif /* PHASE */
)
{
// Get pixels pointer
@@ -154,9 +154,9 @@
#ifdef MAGNITUDE
Image magnitude = CONVERT_TO_IMAGE_STRUCT(magnitude);
vstore16(MAGNITUDE_OP(in_a, in_b), 0, (__global DATA_TYPE *)magnitude.ptr);
-#endif
+#endif /* MAGNITUDE */
#ifdef PHASE
Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
vstore16(PHASE_OP(in_a, in_b), 0, phase.ptr);
-#endif
+#endif /* PHASE */
}
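For orientation, the two MAGNITUDE variants selected by the macros above correspond to the usual gradient norms, while the PHASE variants quantize atan2(gy, gx) to 8 bits. A scalar sketch of the magnitude formulas only:

#include <math.h>

/* Reference formulas for the two magnitude variants selected above. */
static float magnitude_l1_ref(float gx, float gy) /* 1 == MAGNITUDE */
{
    return fabsf(gx) + fabsf(gy);
}

static float magnitude_l2_ref(float gx, float gy) /* 2 == MAGNITUDE */
{
    return sqrtf(gx * gx + gy * gy);
}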
diff --git a/src/core/CL/cl_kernels/mean_stddev.cl b/src/core/CL/cl_kernels/mean_stddev.cl
index 50b8312..7c29d2f 100644
--- a/src/core/CL/cl_kernels/mean_stddev.cl
+++ b/src/core/CL/cl_kernels/mean_stddev.cl
@@ -44,19 +44,19 @@
IMAGE_DECLARATION(src),
uint height,
__global ulong *global_sum
-#if defined STDDEV
+#ifdef STDDEV
,
__global ulong *global_sum_sq
-#endif
+#endif /* STDDEV */
)
{
// Get pixels pointer
Image src = CONVERT_TO_IMAGE_STRUCT(src);
- uint8 tmp_sum = 0;
-#if defined STDDEV
- uint8 tmp_sum_sq = 0;
-#endif
+ uint8 tmp_sum = 0;
+#ifdef STDDEV
+ uint8 tmp_sum_sq = 0;
+#endif /* STDDEV */
// Calculate partial sum
for(int i = 0; i < height; i++)
{
@@ -64,20 +64,20 @@
uint8 data = convert_uint8(vload8(0, offset(&src, 0, i)));
tmp_sum += data;
-#if defined STDDEV
+#ifdef STDDEV
tmp_sum_sq += data * data;
-#endif
+#endif /* STDDEV */
}
// Perform reduction
tmp_sum.s0123 += tmp_sum.s4567;
tmp_sum.s01 += tmp_sum.s23;
atom_add(global_sum, tmp_sum.s0 + tmp_sum.s1);
-#if defined STDDEV
+#ifdef STDDEV
tmp_sum_sq.s0123 += tmp_sum_sq.s4567;
tmp_sum_sq.s01 += tmp_sum_sq.s23;
atom_add(global_sum_sq, tmp_sum_sq.s0 + tmp_sum_sq.s1);
-#endif
+#endif /* STDDEV */
}
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
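The kernel above only accumulates a global sum (and, when STDDEV is defined, a global sum of squares); the caller is expected to finish the computation. A host-side sketch of that final step, assuming num_pixels values contributed to the sums:

#include <math.h>

/* Host-side sketch: derive mean and standard deviation from the accumulated
 * sum and sum of squares produced by the kernel above. */
static void finalize_mean_stddev(unsigned long long sum, unsigned long long sum_sq,
                                 unsigned long long num_pixels, float *mean, float *stddev)
{
    *mean   = (float)sum / (float)num_pixels;
    *stddev = sqrtf((float)sum_sq / (float)num_pixels - (*mean) * (*mean));
}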
diff --git a/src/core/CL/cl_kernels/minmax_layer.cl b/src/core/CL/cl_kernels/minmax_layer.cl
new file mode 100644
index 0000000..1e543b4
--- /dev/null
+++ b/src/core/CL/cl_kernels/minmax_layer.cl
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(WIDTH) && defined(HEIGHT) && defined(DEPTH)
+/** This function identifies the minimum and maximum values of an input 3D tensor.
+ *
+ * @note The width, height and depth of the input tensor must be provided at compile time using -DWIDTH, -DHEIGHT and -DDEPTH (e.g. -DWIDTH=320, -DHEIGHT=240, -DDEPTH=3)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32.
+ * @param[in] dst_stride_x Stride of the min/max vector in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the min/max vector
+ */
+__kernel void minmax_layer(
+ TENSOR3D_DECLARATION(src),
+ VECTOR_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Vector dst = CONVERT_TO_VECTOR_STRUCT(dst);
+
+ float4 min_value = (float4)FLT_MAX;
+ float4 max_value = (float4) - FLT_MAX;
+ float2 min_max_value = (float2)(FLT_MAX, -FLT_MAX);
+
+ for(int z = 0; z < DEPTH; ++z)
+ {
+ for(int y = 0; y < HEIGHT; ++y)
+ {
+ int x = 0;
+ __global float *src_addr = (__global float *)(src.ptr + y * src_stride_y + z * src_stride_z);
+
+ for(; x <= (int)(WIDTH - 8); x += 8)
+ {
+ float8 value = vload8(0, src_addr + x);
+
+ min_value = select(value.s0123, min_value, min_value < value.s0123);
+ min_value = select(value.s4567, min_value, min_value < value.s4567);
+
+ max_value = select(value.s0123, max_value, max_value > value.s0123);
+ max_value = select(value.s4567, max_value, max_value > value.s4567);
+ }
+
+ for(; x < WIDTH; ++x)
+ {
+ float value = *(src_addr + x);
+
+ min_max_value.s0 = min(min_max_value.s0, value);
+ min_max_value.s1 = max(min_max_value.s1, value);
+ }
+ }
+ }
+
+ // Perform min/max reduction
+ min_value.s01 = min(min_value.s01, min_value.s23);
+ min_value.s0 = min(min_value.s0, min_value.s1);
+ max_value.s01 = max(max_value.s01, max_value.s23);
+ max_value.s0 = max(max_value.s0, max_value.s1);
+
+ min_max_value.s0 = min(min_max_value.s0, min_value.s0);
+ min_max_value.s1 = max(min_max_value.s1, max_value.s0);
+
+ if(min_max_value.s0 == min_max_value.s1)
+ {
+ min_max_value.s0 = 0.0f;
+ min_max_value.s1 = 1.0f;
+ }
+
+ // Store min and max
+ vstore2(min_max_value, 0, (__global float *)dst.ptr);
+}
+#endif // defined(WIDTH) && defined(HEIGHT) && defined(DEPTH)
\ No newline at end of file
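A scalar restatement of what the vectorized loops and the final reduction compute, including the fallback to the [0, 1] range when the tensor is constant, assuming a densely packed float buffer:

#include <float.h>
#include <stddef.h>

/* Scalar reference for minmax_layer: find min/max over the whole tensor and
 * fall back to the [0, 1] range when all values are equal. */
static void minmax_layer_ref(const float *data, size_t count, float out_min_max[2])
{
    float min_v = FLT_MAX;
    float max_v = -FLT_MAX;
    for(size_t i = 0; i < count; ++i)
    {
        min_v = (data[i] < min_v) ? data[i] : min_v;
        max_v = (data[i] > max_v) ? data[i] : max_v;
    }
    if(min_v == max_v)
    {
        min_v = 0.0f;
        max_v = 1.0f;
    }
    out_min_max[0] = min_v;
    out_min_max[1] = max_v;
}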
diff --git a/src/core/CL/cl_kernels/minmaxloc.cl b/src/core/CL/cl_kernels/minmaxloc.cl
index 799b1e8..0f557a4 100644
--- a/src/core/CL/cl_kernels/minmaxloc.cl
+++ b/src/core/CL/cl_kernels/minmaxloc.cl
@@ -26,15 +26,26 @@
#ifndef DATA_TYPE_MIN
#define DATA_TYPE_MIN 0x0
-#endif
+#endif /* DATA_TYPE_MIN */
#ifndef DATA_TYPE_MAX
#define DATA_TYPE_MAX 0xFF
-#endif
+#endif /* DATA_TYPE_MAX */
+
+inline int FloatFlip(float val)
+{
+ union
+ {
+ int int_val;
+ float flt_val;
+ } u_val;
+ u_val.flt_val = val;
+ return (u_val.int_val >= 0) ? u_val.int_val : u_val.int_val ^ 0x7FFFFFFF;
+}
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MIN);
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_max = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MAX);
-__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+__constant int16 idx16 = (int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
/** This function identifies the minimum and maximum values of an input image.
*
@@ -54,7 +65,7 @@
__kernel void minmax(
IMAGE_DECLARATION(src),
__global int *min_max,
- uint width)
+ int width)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
@@ -65,11 +76,11 @@
local_max = type_min;
// Calculate min/max of row
- uint width4 = width >> 4;
- for(uint i = 0; i < width4; i++)
+ int i = 0;
+ for(; i + 16 <= width; i += 16)
{
VEC_DATA_TYPE(DATA_TYPE, 16)
- data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0));
local_min = min(data, local_min);
local_max = max(data, local_max);
}
@@ -77,12 +88,16 @@
#ifdef NON_MULTIPLE_OF_16
// Handle non multiple of 16
VEC_DATA_TYPE(DATA_TYPE, 16)
- data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0));
+#ifdef IS_DATA_TYPE_FLOAT
+ int16 valid_indices = (i + idx16) < width;
+#else /* IS_DATA_TYPE_FLOAT */
VEC_DATA_TYPE(DATA_TYPE, 16)
- widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16));
- local_max = max(local_max, select(type_min, data, widx));
- local_min = min(local_min, select(type_max, data, widx));
-#endif
+ valid_indices = CONVERT((i + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16));
+#endif /* IS_DATA_TYPE_FLOAT */
+ local_max = max(local_max, select(type_min, data, valid_indices));
+ local_min = min(local_min, select(type_max, data, valid_indices));
+#endif /* NON_MULTIPLE_OF_16 */
// Perform min/max reduction
local_min.s01234567 = min(local_min.s01234567, local_min.s89ABCDEF);
@@ -98,8 +113,13 @@
local_max.s0 = max(local_max.s0, local_max.s1);
// Update global min/max
+#ifdef IS_DATA_TYPE_FLOAT
+ atomic_min(&min_max[0], FloatFlip(local_min.s0));
+ atomic_max(&min_max[1], FloatFlip(local_max.s0));
+#else /* IS_DATA_TYPE_FLOAT */
atomic_min(&min_max[0], local_min.s0);
atomic_max(&min_max[1], local_max.s0);
+#endif /* IS_DATA_TYPE_FLOAT */
}
/** This function counts the min and max occurrences in an image and tags their position.
@@ -124,41 +144,50 @@
IMAGE_DECLARATION(src),
__global int *min_max,
__global uint *min_max_count
-#if defined LOCATE_MIN
+#ifdef LOCATE_MIN
,
__global Coordinates2D *min_loc, uint max_min_loc_count
-#endif
-#if defined LOCATE_MAX
+#endif /* LOCATE_MIN */
+#ifdef LOCATE_MAX
,
__global Coordinates2D *max_loc, uint max_max_loc_count
-#endif
+#endif /* LOCATE_MAX */
)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef IS_DATA_TYPE_FLOAT
+ __global float *min_max_ptr = (__global float *)min_max;
+ float min_value = min_max_ptr[0];
+ float max_value = min_max_ptr[1];
+#else /* IS_DATA_TYPE_FLOAT */
+ int min_value = min_max[0];
+ int max_value = min_max[1];
+#endif /* IS_DATA_TYPE_FLOAT */
+
DATA_TYPE value = *((__global DATA_TYPE *)src.ptr);
-#if defined COUNT_MIN_MAX
- if(value == min_max[0])
+#ifdef COUNT_MIN_MAX
+ if(value == min_value)
{
uint idx = atomic_inc(&min_max_count[0]);
-#if defined LOCATE_MIN
+#ifdef LOCATE_MIN
if(idx < max_min_loc_count)
{
min_loc[idx].x = get_global_id(0);
min_loc[idx].y = get_global_id(1);
}
-#endif
+#endif /* LOCATE_MIN */
}
- if(value == min_max[1])
+ if(value == max_value)
{
uint idx = atomic_inc(&min_max_count[1]);
-#if defined LOCATE_MAX
+#ifdef LOCATE_MAX
if(idx < max_max_loc_count)
{
max_loc[idx].x = get_global_id(0);
max_loc[idx].y = get_global_id(1);
}
-#endif
+#endif /* LOCATE_MAX */
}
-#endif
+#endif /* COUNT_MIN_MAX */
}
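The FloatFlip helper introduced above maps float bit patterns to signed integers whose ordering matches the float ordering: non-negative floats keep their bits, negative floats have their low 31 bits inverted, so atomic_min/atomic_max on the int buffer remain correct for float inputs. The mapping is an involution, so the same transform also undoes it; the host-side unflip below is a sketch under that assumption, not code from the patch.

#include <stdint.h>
#include <string.h>

/* Same mapping as the kernel's FloatFlip: float bits whose signed comparison
 * order matches the float comparison order. */
static int32_t float_flip(float val)
{
    int32_t bits;
    memcpy(&bits, &val, sizeof(bits));
    return (bits >= 0) ? bits : (int32_t)(bits ^ 0x7FFFFFFF);
}

/* Host-side sketch (assumption): recover the float from a flipped value,
 * exploiting that the transform is its own inverse. */
static float float_unflip(int32_t flipped)
{
    int32_t bits = (flipped >= 0) ? flipped : (int32_t)(flipped ^ 0x7FFFFFFF);
    float   val;
    memcpy(&val, &bits, sizeof(val));
    return val;
}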
diff --git a/src/core/CL/cl_kernels/non_linear_filter3x3.cl b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
index f860c96..19118ea 100644
--- a/src/core/CL/cl_kernels/non_linear_filter3x3.cl
+++ b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
@@ -54,13 +54,13 @@
uchar16 bottom = vload16(0, offset(&src, -1, 1));
// Apply respective filter
-#if defined MIN
- uchar16 tmp = min(top, min(middle, bottom));
- uchar8 out = row_reduce_min_3(tmp);
-#elif defined MAX
+#ifdef MIN
+ uchar16 tmp = min(top, min(middle, bottom));
+ uchar8 out = row_reduce_min_3(tmp);
+#elif defined(MAX)
uchar16 tmp = max(top, max(middle, bottom));
uchar8 out = row_reduce_max_3(tmp);
-#elif defined MEDIAN
+#elif defined(MEDIAN)
uchar8 p0 = top.s01234567;
uchar8 p1 = top.s12345678;
uchar8 p2 = top.s23456789;
@@ -71,9 +71,9 @@
uchar8 p7 = bottom.s12345678;
uchar8 p8 = bottom.s23456789;
uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else
+#else /* MIN or MAX or MEDIAN */
#error "Unsupported filter function"
-#endif
+#endif /* MIN or MAX or MEDIAN */
// Store result
vstore8(out, 0, dst.ptr);
@@ -109,22 +109,22 @@
uchar8 bottom = vload8(0, offset(&src, 0, 1));
// Apply respective filter
-#if defined MIN
- uchar8 tmp_middle = row_reduce_min_3(middle);
- uchar8 out = min(tmp_middle, min(top, bottom));
-#elif defined MAX
+#ifdef MIN
+ uchar8 tmp_middle = row_reduce_min_3(middle);
+ uchar8 out = min(tmp_middle, min(top, bottom));
+#elif defined(MAX)
uchar8 tmp_middle = row_reduce_max_3(middle);
uchar8 out = max(tmp_middle, max(top, bottom));
-#elif defined MEDIAN
+#elif defined(MEDIAN)
uchar8 p0 = top.s01234567;
uchar8 p1 = middle.s01234567;
uchar8 p2 = middle.s12345678;
uchar8 p3 = middle.s23456789;
uchar8 p4 = bottom.s01234567;
uchar8 out = sort5(p0, p1, p2, p3, p4);
-#else
+#else /* MIN or MAX or MEDIAN */
#error "Unsupported filter function"
-#endif
+#endif /* MIN or MAX or MEDIAN */
// Store result
vstore8(out, 0, dst.ptr);
@@ -160,13 +160,13 @@
uchar16 bottom = vload16(0, offset(&src, -1, 1));
// Apply respective filter
-#if defined MIN
- uchar16 tmp = min(top, min(middle, bottom));
- uchar8 out = row_reduce_min_3(tmp);
-#elif defined MAX
+#ifdef MIN
+ uchar16 tmp = min(top, min(middle, bottom));
+ uchar8 out = row_reduce_min_3(tmp);
+#elif defined(MAX)
uchar16 tmp = max(top, max(middle, bottom));
uchar8 out = row_reduce_max_3(tmp);
-#elif defined MEDIAN
+#elif defined(MEDIAN)
uchar8 p0 = top.s01234567;
uchar8 p1 = top.s12345678;
uchar8 p2 = top.s23456789;
@@ -177,9 +177,9 @@
uchar8 p7 = bottom.s12345678;
uchar8 p8 = bottom.s23456789;
uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else
+#else /* MIN or MAX or MEDIAN */
#error "Unsupported filter function"
-#endif
+#endif /* MIN or MAX or MEDIAN */
// Store result
vstore8(out, 0, dst.ptr);
diff --git a/src/core/CL/cl_kernels/non_linear_filter5x5.cl b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
index d9ae95f..d3b2958 100644
--- a/src/core/CL/cl_kernels/non_linear_filter5x5.cl
+++ b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
@@ -351,17 +351,17 @@
uchar16 bottom2 = vload16(0, offset(&src, -2, 2));
// Apply respective filter
-#if defined MIN
- uchar16 tmp = min(middle, min(min(top2, top), min(bottom, bottom2)));
- uchar8 out = row_reduce_min_5(tmp);
-#elif defined MAX
+#ifdef MIN
+ uchar16 tmp = min(middle, min(min(top2, top), min(bottom, bottom2)));
+ uchar8 out = row_reduce_min_5(tmp);
+#elif defined(MAX)
uchar16 tmp = max(middle, max(max(top2, top), max(bottom, bottom2)));
uchar8 out = row_reduce_max_5(tmp);
-#elif defined MEDIAN
+#elif defined(MEDIAN)
uchar8 out = median_box5x5(top2, top, middle, bottom, bottom2);
-#else
+#else /* MIN or MAX or MEDIAN */
#error "Unsupported filter function"
-#endif
+#endif /* MIN or MAX or MEDIAN */
// Store result
vstore8(out, 0, dst.ptr);
@@ -392,33 +392,33 @@
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
// Load values
- uchar16 top2 = vload16(0, offset(&src, 0, -2));
- uchar16 top = vload16(0, offset(&src, 0, -1));
+ uchar8 top2 = vload8(0, offset(&src, 0, -2));
+ uchar8 top = vload8(0, offset(&src, 0, -1));
uchar16 middle = vload16(0, offset(&src, -2, 0));
- uchar16 bottom = vload16(0, offset(&src, 0, 1));
- uchar16 bottom2 = vload16(0, offset(&src, 0, 2));
+ uchar8 bottom = vload8(0, offset(&src, 0, 1));
+ uchar8 bottom2 = vload8(0, offset(&src, 0, 2));
// Apply respective filter
-#if defined MIN
- uchar8 tmp_middle = row_reduce_min_5(middle);
- uchar8 out = min(tmp_middle, min(min(top2.s01234567, top.s01234567), min(bottom.s01234567, bottom2.s01234567)));
-#elif defined MAX
+#ifdef MIN
+ uchar8 tmp_middle = row_reduce_min_5(middle);
+ uchar8 out = min(tmp_middle, min(min(top2, top), min(bottom, bottom2)));
+#elif defined(MAX)
uchar8 tmp_middle = row_reduce_max_5(middle);
- uchar8 out = max(tmp_middle, max(max(top2.s01234567, top.s01234567), max(bottom.s01234567, bottom2.s01234567)));
-#elif defined MEDIAN
- uchar8 p0 = top2.s01234567;
- uchar8 p1 = top.s01234567;
+ uchar8 out = max(tmp_middle, max(max(top2, top.s01234567), max(bottom, bottom2)));
+#elif defined(MEDIAN)
+ uchar8 p0 = top2;
+ uchar8 p1 = top;
uchar8 p2 = middle.s01234567;
uchar8 p3 = middle.s12345678;
uchar8 p4 = middle.s23456789;
uchar8 p5 = middle.s3456789A;
uchar8 p6 = middle.s456789AB;
- uchar8 p7 = bottom.s01234567;
- uchar8 p8 = bottom2.s01234567;
+ uchar8 p7 = bottom;
+ uchar8 p8 = bottom2;
uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else
+#else /* MIN or MAX or MEDIAN */
#error "Unsupported filter function"
-#endif
+#endif /* MIN or MAX or MEDIAN */
// Store result
vstore8(out, 0, dst.ptr);
@@ -449,30 +449,34 @@
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
// Load values
- uchar16 top2 = vload16(0, offset(&src, -1, -2));
+ uchar16 top2 = vload16(0, offset(&src, -2, -2));
uchar16 top = vload16(0, offset(&src, -2, -1));
uchar16 middle = vload16(0, offset(&src, -2, 0));
uchar16 bottom = vload16(0, offset(&src, -2, 1));
- uchar16 bottom2 = vload16(0, offset(&src, -1, 2));
+ uchar16 bottom2 = vload16(0, offset(&src, -2, 2));
+
+ // Shift top2 and bottom2 values
+ top2 = top2.s123456789ABCDEFF;
+ bottom2 = bottom2.s123456789ABCDEFF;
// Apply respective filter
-#if defined MIN
- uchar16 tmp_3 = min(top2, bottom2);
- uchar16 tmp_5 = min(middle, min(top, bottom));
- uchar8 tmp_3_red = row_reduce_min_3(tmp_3);
- uchar8 tmp_5_red = row_reduce_min_5(tmp_5);
- uchar8 out = min(tmp_3_red, tmp_5_red);
-#elif defined MAX
+#ifdef MIN
+ uchar16 tmp_3 = min(top2, bottom2);
+ uchar16 tmp_5 = min(middle, min(top, bottom));
+ uchar8 tmp_3_red = row_reduce_min_3(tmp_3);
+ uchar8 tmp_5_red = row_reduce_min_5(tmp_5);
+ uchar8 out = min(tmp_3_red, tmp_5_red);
+#elif defined(MAX)
uchar16 tmp_3 = max(top2, bottom2);
uchar16 tmp_5 = max(middle, max(top, bottom));
uchar8 tmp_3_red = row_reduce_max_3(tmp_3);
uchar8 tmp_5_red = row_reduce_max_5(tmp_5);
uchar8 out = max(tmp_3_red, tmp_5_red);
-#elif defined MEDIAN
+#elif defined(MEDIAN)
uchar8 out = median_disk5x5(top2, top, middle, bottom, bottom2);
-#else
+#else /* MIN or MAX or MEDIAN */
#error "Unsupported filter function"
-#endif
+#endif /* MIN or MAX or MEDIAN */
// Store result
vstore8(out, 0, dst.ptr);
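The MEDIAN branches rely on the sort5/sort9 sorting networks from the helpers; functionally they select the middle value of the gathered window. A plain scalar equivalent (not the kernel's sorting network) for a 9-element window:

/* Scalar equivalent of the median-of-9 selection performed by sort9 in the
 * MEDIAN branches above: sort the window and return the middle element. */
static unsigned char median9_ref(unsigned char v[9])
{
    for(int i = 1; i < 9; ++i)
    {
        const unsigned char key = v[i];
        int j = i - 1;
        while(j >= 0 && v[j] > key)
        {
            v[j + 1] = v[j];
            --j;
        }
        v[j + 1] = key;
    }
    return v[4];
}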
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
index 076b0d8..4e65560 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -23,132 +23,152 @@
*/
#include "helpers.h"
+#if defined(FIXED_POINT_POSITION)
+
+#include "fixed_point.h"
+#define MUL_OP(x, y) MUL_SAT_OP_EXPAND((x), (y), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define ADD_OP(x, y) ADD_SAT_OP_EXPAND((x), (y), DATA_TYPE, VEC_SIZE)
+#define DIV_OP(x, y) DIV_SAT_OP_VEC_EXPAND((x), (y), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define EXP_OP(x) EXP_OP_EXPAND((x), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define LOG_OP(x) LOG_OP_EXPAND((x), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define POW_OP(x, y) EXP_OP(MUL_OP(LOG_OP((x)), (y)))
+#define SQCVT_SAT(a) SQCVT_SAT_OP_EXPAND((a), DATA_TYPE, FIXED_POINT_POSITION)
+
+#define LOAD_OP(offset, ptr) vload16(offset, ptr)
+#define STORE_OP(data, offset, ptr) vstore16(data, offset, ptr)
+
+#else // FIXED_POINT_POSITION
+
+#define MUL_OP(x, y) ((x) * (y))
+#define ADD_OP(x, y) ((x) + (y))
+#define DIV_OP(x, y) ((x) / (y))
+#define POW_OP(x, y) pow((x), (y))
+#define SQCVT_SAT(a) (a)
+
+#define LOAD_OP(offset, ptr) vload4(offset, ptr)
+#define STORE_OP(data, offset, ptr) vstore4(data, offset, ptr)
+
+#endif // FIXED_POINT_POSITION
+
/** Apply cross map normalization.
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
+ * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
*
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16, F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] squared_input_ptr Pointer to the second source tensor. Supported data types: F16, F32
- * @param[in] squared_input_stride_x Stride of the second source tensor in X dimension (in bytes)
- * @param[in] squared_input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] squared_input_stride_y Stride of the second source tensor in Y dimension (in bytes)
- * @param[in] squared_input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] squared_input_stride_z Stride of the second source tensor in Z dimension (in bytes)
- * @param[in] squared_input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] squared_input_offset_first_element_in_bytes The offset of the second element in the second source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16, F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] coeff Alpha parameter / norm_size
- * @param[in] beta Beta parameter in the normalization equation
- * @param[in] kappa Kappa parameter in the normalization equation
- * @param[in] radius Number of elements on the right or left side to normalize across
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(squared_input),
- TENSOR3D_DECLARATION(output),
- float coeff,
- float beta,
- float kappa,
- uint radius)
+ TENSOR3D_DECLARATION(output))
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D squared_in = CONVERT_TO_TENSOR3D_STRUCT(squared_input);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
- DATA_TYPE acc = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
- const int num_of_slices = get_global_size(2);
const int current_slice = get_global_id(2);
- const int left_slice = max(current_slice - (int)radius, (int)0);
- const int right_slice = min(current_slice + (int)radius, (int)(num_of_slices - 1));
+ const int left_slice = max(current_slice - (int)RADIUS, (int)0);
+ const int right_slice = min(current_slice + (int)RADIUS, (int)(NUM_SLICES - 1));
for(int i = left_slice; i <= right_slice; i++)
{
- acc += *(__global DATA_TYPE *)tensor3D_offset(&squared_in, 0, 0, i - current_slice);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i - current_slice));
+ acc = ADD_OP(acc, MUL_OP(values, values));
}
- const float normalized = pow(kappa + coeff * (float)acc, beta);
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized);
- const float normalized_pixel = (float) * ((__global DATA_TYPE *)in.ptr) / normalized;
-
- *(__global DATA_TYPE *)out.ptr = CONVERT(normalized_pixel, DATA_TYPE);
+ STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
}
/** Apply in map normalization.
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
*
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16, F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] squared_input_ptr Pointer to the second source tensor. Supported data types: F16, F32
- * @param[in] squared_input_stride_x Stride of the second source tensor in X dimension (in bytes)
- * @param[in] squared_input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] squared_input_stride_y Stride of the second source tensor in Y dimension (in bytes)
- * @param[in] squared_input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] squared_input_stride_z Stride of the second source tensor in Z dimension (in bytes)
- * @param[in] squared_input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] squared_input_offset_first_element_in_bytes The offset of the second element in the second source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16, F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] coeff Alpha parameter / norm_size
- * @param[in] beta Beta parameter in the normalization equation
- * @param[in] kappa Kappa parameter in the normalization equation
- * @param[in] radius Number of elements on the right or left side to normalize across
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QS8/F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void normalization_layer_in_map_1D(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(squared_input),
- TENSOR3D_DECLARATION(output),
- float coeff,
- float beta,
- float kappa,
- uint radius)
+ TENSOR3D_DECLARATION(output))
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D squared_in = CONVERT_TO_TENSOR3D_STRUCT(squared_input);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- acc_vec = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
const int current_pos = get_global_id(0) << 2;
- const int left_pos = max(current_pos - (int)radius, -3);
- const int right_pos = min(current_pos + (int)radius, (int)((get_global_size(0) << 2) + 3 - 1));
+ const int left_pos = max(current_pos - (int)RADIUS, -3);
+ const int right_pos = min(current_pos + (int)RADIUS, (int)((get_global_size(0) << 2) + 3 - 1));
for(int i = left_pos; i <= right_pos; i += 1)
{
- acc_vec += vload4(0, (__global DATA_TYPE *)tensor3D_offset(&squared_in, i - current_pos, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, i - current_pos, 0, 0));
+ acc = ADD_OP(acc, MUL_OP(values, values));
}
- const float4 normalized = pow((float4)kappa + coeff * (float4)acc_vec, beta);
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized);
- const float4 normalized_pixel = CONVERT(vload4(0, (__global DATA_TYPE *)in.ptr), float4) / normalized;
-
- vstore4(CONVERT(normalized_pixel, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)out.ptr);
+ STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
}
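In scalar form, the cross-map path above computes out = in / (KAPPA + COEFF * sum of in_j^2 over the window)^BETA, with the window of 2*RADIUS+1 slices clamped to [0, NUM_SLICES). A float-only reference, fixed-point paths omitted:

#include <math.h>

/* Scalar float reference for cross-map normalization at one (x, y) position;
 * slices[i] is the value of feature map i at that position. */
static float cross_map_normalize_ref(const float *slices, int num_slices, int current_slice,
                                     int radius, float coeff, float beta, float kappa)
{
    const int left  = (current_slice - radius < 0) ? 0 : current_slice - radius;
    const int right = (current_slice + radius > num_slices - 1) ? num_slices - 1 : current_slice + radius;

    float acc = 0.0f;
    for(int i = left; i <= right; ++i)
    {
        acc += slices[i] * slices[i];
    }
    return slices[current_slice] / powf(kappa + coeff * acc, beta);
}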
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
index ae2031f..f4f36a0 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
@@ -25,9 +25,9 @@
#ifdef SATURATE
#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
-#else
+#else /* SATURATE */
#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
-#endif
+#endif /* SATURATE */
#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
@@ -43,31 +43,37 @@
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
* @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
* @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32
* @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
* @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
* @param[in] scale Float scaling factor. Supported data types: F32
*/
__kernel void pixelwise_mul_float(
- IMAGE_DECLARATION(in1),
- IMAGE_DECLARATION(in2),
- IMAGE_DECLARATION(out),
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out),
const float scale)
{
// Get pixels pointer
- Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
- Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load data
VEC_DATA_TYPE(DATA_TYPE_RES, 16)
@@ -76,13 +82,13 @@
in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
// Perform multiplication
-#if defined DATA_TYPE_FLOAT
+#ifdef DATA_TYPE_FLOAT
VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- res = CONVERT(in1_data * in2_data * scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-#else
+ res = CONVERT(in1_data * in2_data * (DATA_TYPE_RES)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+#else /* DATA_TYPE_FLOAT */
VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data * in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
-#endif
+#endif /* DATA_TYPE_FLOAT */
// Store result
vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
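For the non-float input path above, the result is the widened product scaled by the float factor, then rounded and saturated to the output type. A scalar S16 sketch; the rounding mode is selected by the ROUND build option, so round-to-nearest is an assumption here:

#include <math.h>
#include <stdint.h>

/* Scalar sketch of pixelwise_mul_float for S16 inputs: widen, multiply,
 * apply the float scale, round, then saturate to S16. */
static int16_t pixelwise_mul_float_ref(int16_t a, int16_t b, float scale)
{
    float res = (float)((int32_t)a * (int32_t)b) * scale;
    res = nearbyintf(res); /* rounding mode assumed; the kernel picks it via ROUND */
    if(res > 32767.0f)
    {
        res = 32767.0f;
    }
    if(res < -32768.0f)
    {
        res = -32768.0f;
    }
    return (int16_t)res;
}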
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
index 05c437c..b5734a3 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
@@ -23,12 +23,28 @@
*/
#include "helpers.h"
-#ifdef SATURATE
-#define CONVERT_OP_INT_STR(x, type) (convert_##type##_sat(x))
-#else
-#define CONVERT_OP_INT_STR(x, type) (convert_##type(x))
-#endif
-#define CONVERT_OP_INT(x, type) CONVERT_OP_INT_STR(x, type)
+#if defined(FIXED_POINT_POSITION)
+
+#include "fixed_point.h"
+
+#if defined(SATURATE)
+#define MUL_OP(x, y, scale, type, size) MUL_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
+#else // SATURATE
+#define MUL_OP(x, y, scale, type, size) MUL_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
+#endif // SATURATE
+
+#else // FIXED_POINT_POSITION
+
+#if defined(SATURATE)
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
+#else // SATURATE
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x))
+#endif // SATURATE
+#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size)
+
+#define MUL_OP(x, y, scale, type, size) CONVERT_OP_INT((x) * (y) >> scale, type, size)
+
+#endif // FIXED_POINT_POSITION
/** Performs a pixelwise multiplication with integer scale of integer inputs.
*
@@ -36,37 +52,44 @@
* e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
* @attention The data_type of the intermediate result of the multiplication should be passed as well using -DDATA_TYPE_RES.
* e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
+ * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
*
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16
* @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
* @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
* @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
* @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] scale Integer scaling factor. Supported data types: S32
+ * @param[in] scale Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1).
*/
__kernel void pixelwise_mul_int(
- IMAGE_DECLARATION(in1),
- IMAGE_DECLARATION(in2),
- IMAGE_DECLARATION(out),
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out),
const uint scale)
{
// Get pixels pointer
- Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
- Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load data
VEC_DATA_TYPE(DATA_TYPE_RES, 16)
@@ -75,5 +98,5 @@
in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
// Perform multiplication and store result
- vstore16(CONVERT_OP_INT(((in1_data * in2_data) >> scale), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+ vstore16(MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr);
}
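The new fixed-point MUL_OP path reduces to a Q-format multiply: products are rounded, shifted right by FIXED_POINT_POSITION and saturated, with the scale argument assumed to be 1. A scalar QS8 sketch; round-half-up and fixed_point_position >= 1 are assumptions for illustration:

#include <stdint.h>

/* Scalar sketch of a saturating QS8 fixed-point multiply (scale assumed to be 1). */
static int8_t mul_sat_qs8_ref(int8_t a, int8_t b, int fixed_point_position)
{
    const int32_t round_const = 1 << (fixed_point_position - 1);
    int32_t       res         = ((int32_t)a * (int32_t)b + round_const) >> fixed_point_position;
    if(res > 127)
    {
        res = 127;
    }
    if(res < -128)
    {
        res = -128;
    }
    return (int8_t)res;
}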
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 1902df9..99d7e6e 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -23,29 +23,186 @@
*/
#include "helpers.h"
-#if defined POOL_AVG
-#define POOL_OP(x, y) ((x) + (y))
-#else
-#define POOL_OP(x, y) (fmax((x), (y)))
-#endif
+#ifdef FIXED_POINT_POSITION
-float calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+#include "fixed_point.h"
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) add_sat(x, y)
+#else /* POOL_AVG */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* POOL_AVG */
+
+#define DIV_OP1(x, y) DIV_SAT_OP_EXPAND((x), (y), DATA_TYPE, FIXED_POINT_POSITION)
+#define DIV_OP(x, y) DIV_OP1(x, y << FIXED_POINT_POSITION)
+#define SQRT_OP(x) DIV_OP1((1 << FIXED_POINT_POSITION), (INVSQRT_OP_EXPAND((x), DATA_TYPE, 1, FIXED_POINT_POSITION)))
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) MUL_SAT_OP_EXPAND((x), (x), DATA_TYPE, vec_size, FIXED_POINT_POSITION)
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#else /* FIXED_POINT_POSITION */
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#endif /* FIXED_POINT_POSITION */
+
+#if STRIDE_X == 1
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output)
+#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output)
+#endif /* STRIDE_X == 3 */
+
+#define POOLING3x3_STRIDE1(res, input, output) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ data00 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ data01 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ data10 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ data11 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ data20 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ data21 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \
+ data00 = POW2_OP(data00, 4); \
+ data01 = POW2_OP(data01, 2); \
+ data10 = POW2_OP(data10, 4); \
+ data11 = POW2_OP(data11, 2); \
+ data20 = POW2_OP(data20, 4); \
+ data21 = POW2_OP(data21, 2); \
+ \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01212323); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ values01 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ values10 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data10.s01212323); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ values11 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ values20 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data20.s01212323); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ values21 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \
+ \
+ values00 = POOL_OP(values00, values10); \
+ values01 = POOL_OP(values01, values11); \
+ values00 = POOL_OP(values00, values20); \
+ values01 = POOL_OP(values01, values21); \
+ \
+ res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s147, values01.s2)); \
+ res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s25, values01.s03)); \
+ })
+
+#define POOLING3x3_STRIDE2(res, input, output) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ data00 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
+ DATA_TYPE data01 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ data10 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
+ DATA_TYPE data11 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
+ DATA_TYPE data21 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
+ data00 = POW2_OP(data00, 8); \
+ data01 = POW2_OP(data01, 1); \
+ data10 = POW2_OP(data10, 8); \
+ data11 = POW2_OP(data11, 1); \
+ data20 = POW2_OP(data20, 8); \
+ data21 = POW2_OP(data21, 1); \
+ \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01223445); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ values01 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s667, data01); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ values10 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data10.s01223445); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ values11 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data10.s667, data11); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ values20 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data20.s01223445); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ values21 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data20.s667, data21); \
+ \
+ values00 = POOL_OP(values00, values10); \
+ values01 = POOL_OP(values01, values11); \
+ values00 = POOL_OP(values00, values20); \
+ values01 = POOL_OP(values01, values21); \
+ \
+ res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s147, values01.s2)); \
+ res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s25, values01.s03)); \
+ })
+
+#define POOLING3x3_STRIDE3(res, input, output) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ data00 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ data01 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ data10 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ data11 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ data21 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
+ data00 = POW2_OP(data00, 8); \
+ data01 = POW2_OP(data01, 4); \
+ data10 = POW2_OP(data10, 8); \
+ data11 = POW2_OP(data11, 4); \
+ data20 = POW2_OP(data20, 8); \
+ data21 = POW2_OP(data21, 4); \
+ \
+ data00 = POOL_OP(data00, data10); \
+ data01 = POOL_OP(data01, data11); \
+ data00 = POOL_OP(data00, data20); \
+ data01 = POOL_OP(data01, data21); \
+ \
+ res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s147, data01.s2)); \
+ res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s25, data01.s03)); \
+ })
+
+DATA_TYPE calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
- int start_x = get_global_id(0) * stride_x - pad_x;
- int start_y = get_global_id(1) * stride_y - pad_y;
- int end_x = min(start_x + pool_size, upper_bound_w);
- int end_y = min(start_y + pool_size, upper_bound_h);
- return 1.f / ((end_y - start_y) * (end_x - start_x));
+ const int start_x = get_global_id(0) * stride_x - pad_x;
+ const int start_y = get_global_id(1) * stride_y - pad_y;
+ const int end_x = min(start_x + pool_size, upper_bound_w);
+ const int end_y = min(start_y + pool_size, upper_bound_h);
+ return ((end_y - start_y) * (end_x - start_x));
}
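+
+// A worked example (hypothetical configuration, not part of the kernel interface):
+// with pool_size = 2, stride_x = stride_y = 2, pad_x = pad_y = 0 and upper_bound_w = 7,
+// the output element at get_global_id(0) == 3 starts at start_x = 6 and is clipped to
+// end_x = min(8, 7) = 7, so only one column of the 2x2 window lies inside the image
+// and the divisor becomes 1 * 2 = 2 instead of 4 (assuming the y window is fully inside).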
/** Performs a pooling function of pool size equal to 2.
*
- * @note Pooling stride must be passed using -DPOOL_STRIDE e.g -DPOOL_STRIDE=2. Supported strides are 1,2,3
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
- * @note In case of average pooling -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimensions
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
@@ -53,7 +210,7 @@
* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
@@ -61,18 +218,10 @@
* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad)
- * @param[in] strides The pooling operation strides in each dimension
- * @param[in] paddings The pooling operation paddings in each dimension
*/
__kernel void pooling_layer_2(
TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output)
-#ifdef POOL_AVG
- ,
- int2 max_dims, int2 strides, int2 paddings
-#endif
-)
+ TENSOR3D_DECLARATION(output))
{
// Get pixels pointer
Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
@@ -84,26 +233,40 @@
VEC_DATA_TYPE(DATA_TYPE, 2)
data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 = POW2_OP(data0, 2);
+ data1 = POW2_OP(data1, 2);
+#endif /* defined(POOL_L2) */
+
// Perform calculations
data0 = POOL_OP(data0, data1);
DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
- // Divide by 4 in case of average pooling
-#ifdef POOL_AVG
- res *= calculate_avg_scale(2, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y);
-#endif
+#if defined(POOL_AVG) || defined(POOL_L2)
+ // Divide by pool region in case of average or l2 pooling
+ res = DIV_OP(res, calculate_avg_scale(2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
// Store result
*(__global DATA_TYPE *)output.ptr = res;
}
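+
+// A possible set of build options for this kernel (illustrative values only, not taken
+// from the library): average pooling of F32 data with stride 2 and no padding on a
+// 32x32 input plane could be compiled with
+//   -DDATA_TYPE=float -DPOOL_AVG -DMAX_WIDTH=32 -DMAX_HEIGHT=32
+//   -DSTRIDE_X=2 -DSTRIDE_Y=2 -DPAD_X=0 -DPAD_Y=0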
-/** Performs a pooling function of pool size equal to 3.
+/** Performs a pooling function of pool size equal to 3
*
- * @note Pooling stride must be passed using -DPOOL_STRIDE e.g -DPOOL_STRIDE=2. Supported strides are 1,2,3
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
- * @note In case of average pooling -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimensions
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
@@ -111,7 +274,7 @@
* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
@@ -119,18 +282,10 @@
* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad)
- * @param[in] strides The pooling operation strides in each dimension
- * @param[in] paddings The pooling operation paddings in each dimension
*/
__kernel void pooling_layer_3(
TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output)
-#ifdef POOL_AVG
- ,
- int2 max_dims, int2 strides, int2 paddings
-#endif
-)
+ TENSOR3D_DECLARATION(output))
{
// Get pixels pointer
Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
@@ -144,16 +299,306 @@
VEC_DATA_TYPE(DATA_TYPE, 3)
data2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 = POW2_OP(data0, 3);
+ data1 = POW2_OP(data1, 3);
+ data2 = POW2_OP(data2, 3);
+#endif /* defined(POOL_L2) */
+
// Perform calculations
data0 = POOL_OP(data0, data1);
data0 = POOL_OP(data0, data2);
DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
- // Divide by 4 in case of average pooling
-#ifdef POOL_AVG
- res *= calculate_avg_scale(3, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y);
-#endif
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average or l2 pooling
+ res = DIV_OP(res, calculate_avg_scale(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
// Store result
*(__global DATA_TYPE *)output.ptr = res;
}
+
+#if defined(POOLING3x3) && !defined(FIXED_POINT_POSITION)
+
+#define CONVERT_OP(data_type) convert_##data_type##4
+#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)
+
+VEC_DATA_TYPE(DATA_TYPE, 4)
+calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ const int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
+ const int start_y = get_global_id(1) * stride_y - pad_y;
+ const int4 end_x = min(start_x + (int4)pool_size, (int4)upper_bound_w);
+ const int end_y = min(start_y + pool_size, upper_bound_h);
+ return (VEC_DATA_TYPE(DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
+}
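+
+// For example (illustrative values only): with stride_x = 1, pad_x = 0 and
+// pool_size = 3, the work-item at get_global_id(0) == 0 evaluates the four
+// neighbouring windows starting at start_x = (0, 1, 2, 3); each lane of the
+// returned vector holds 1 / (clipped window area) for its own output element.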
+
+/** Performs an optimized pooling function of pool size equal to 3 when stride_x is less than or equal to 3
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_3_optimized(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ res;
+
+ // Perform pooling 3x3 for 4 output elements
+ POOLING3x3(res, input, output);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average or l2 pooling
+ res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ vstore4(res, 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(POOLING3x3) && !defined(FIXED_POINT_POSITION)
+
+/** Performs a pooling function of pool size equal to 7.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_7(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data1 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data2 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data3 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data4 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data5 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data6 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 = POW2_OP(data0, 8);
+ data1 = POW2_OP(data1, 8);
+ data2 = POW2_OP(data2, 8);
+ data3 = POW2_OP(data3, 8);
+ data4 = POW2_OP(data4, 8);
+ data5 = POW2_OP(data5, 8);
+ data6 = POW2_OP(data6, 8);
+#endif /* defined(POOL_L2) */
+
+ // Pool operation of all rows
+ data0 = POOL_OP(data0, data1);
+ data2 = POOL_OP(data2, data3);
+ data4 = POOL_OP(data4, data5);
+ data0 = POOL_OP(data0, data2);
+ data4 = POOL_OP(data4, data6);
+ data0 = POOL_OP(data0, data4);
+
+ // Set last element
+#if defined(POOL_AVG) || defined(POOL_L2)
+ data0.s7 = 0;
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+ data0.s7 = data0.s6;
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
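+    // The rows are loaded 8 elements wide but the pool is only 7 wide, so lane s7
+    // must be neutral for the reduction below: 0 contributes nothing to a sum
+    // (average/L2 pooling), while a copy of s6 cannot change the result of a max.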
+
+ // Reduce result
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ reduce4 = POOL_OP(data0.s0123, data0.s4567);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
+ DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average or l2 pooling
+ res = DIV_OP(res, calculate_avg_scale(7, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ *(__global DATA_TYPE *)output.ptr = res;
+}
+
+#if defined(POOL_SIZE)
+
+// Set the initial value for the pooling operation accordingly with the data type
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define INITIAL_VALUE 0
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#ifdef FIXED_POINT_POSITION
+#define MIN_VAL_EXPAND(type) type##_MIN
+#define MIN_VAL(type) MIN_VAL_EXPAND(type)
+#define INITIAL_VALUE MIN_VAL(DATA_TYPE)
+#else // FIXED_POINT_POSITION
+#if FP16
+#define INITIAL_VALUE -HALF_MAX
+#else // FP16
+#define INITIAL_VALUE -FLT_MAX
+#endif // FP16
+#endif // FIXED_POINT_POSITION
+
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note -DFP16 must be passed at compile time if half float data type is used
+ * @note Pool size must be passed using -DPOOL_SIZE e.g. -DPOOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_N(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ vdata = INITIAL_VALUE;
+ DATA_TYPE sdata = INITIAL_VALUE;
+
+ // Load data
+ for(int y = 0; y < POOL_SIZE; y++)
+ {
+ int x = 0;
+ for(; x <= ((int)POOL_SIZE - 8); x += 8)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif /* defined(POOL_L2) */
+ vdata = POOL_OP(vdata, data0);
+ }
+
+ // Leftover
+ for(; x < (int)POOL_SIZE; ++x)
+ {
+ DATA_TYPE data0 = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif /* defined(POOL_L2) */
+ sdata = POOL_OP(sdata, data0);
+ }
+ }
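+    // For example, with -DPOOL_SIZE=13 (an assumed value) each row is consumed as
+    // one vload8 plus five scalar leftover loads; the vector accumulator vdata and
+    // the scalar accumulator sdata are merged in the reduction below.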
+
+ // Reduce result
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
+ DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
+ res = POOL_OP(res, sdata);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average or l2 pooling
+ res = DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ *(__global DATA_TYPE *)output.ptr = res;
+}
+#endif // defined(POOL_SIZE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/quantization_layer.cl b/src/core/CL/cl_kernels/quantization_layer.cl
new file mode 100644
index 0000000..80ea540
--- /dev/null
+++ b/src/core/CL/cl_kernels/quantization_layer.cl
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This performs the quantization of floating point inputs to 8-bit unsigned integers.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] min_max_ptr Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32.
+ * @param[in] min_max_stride_x Stride of the min/max vector in X dimension (in bytes)
+ * @param[in] min_max_step_x min_max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] min_max_offset_first_element_in_bytes The offset of the first element in the min/max vector
+ */
+__kernel void quantization_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ VECTOR_DECLARATION(min_max))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // min_max_value.s0 = min, min_max_value.s1 = max
+ const float2 min_max_value = vload2(0, (__global float *)(min_max_ptr + min_max_offset_first_element_in_bytes));
+
+ const float4 vmin = (float4)min_max_value.s0;
+ const float4 vrange = (float4)(min_max_value.s1 - min_max_value.s0);
+
+ // Load data
+ float4 data = vload4(0, (__global float *)input.ptr);
+
+ // Map float values to range [0.0, 1.0]
+ data = (data - vmin) / vrange;
+
+ // Quantize and saturate
+ uchar4 res = convert_uchar4_sat(data * 256.0f);
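+    // e.g. (assumed values) with min = -2.0f and max = 2.0f, an input of 1.0f maps
+    // to (1 - (-2)) / 4 = 0.75 and quantizes to convert_uchar4_sat(0.75f * 256.0f) = 192.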
+
+ // Store result
+ vstore4(res, 0, (__global uchar *)output.ptr);
+}
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
new file mode 100644
index 0000000..d46a226
--- /dev/null
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Calculate square sum of a vector
+ *
+ * @param[in] input Pointer to the first pixel.
+ *
+ * @return square sum of vector.
+ */
+inline DATA_TYPE square_sum(__global const DATA_TYPE *input)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, input);
+
+ in *= in;
+
+ in.s01234567 += in.s89ABCDEF;
+ in.s0123 += in.s4567;
+ in.s01 += in.s23;
+
+ return (in.s0 + in.s1);
+}
+
+/** Calculate sum of a vector
+ *
+ * @param[in] input Pointer to the first pixel.
+ *
+ * @return sum of vector.
+ */
+inline DATA_TYPE sum(__global const DATA_TYPE *input)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, input);
+
+ in.s01234567 += in.s89ABCDEF;
+ in.s0123 += in.s4567;
+ in.s01 += in.s23;
+
+ return (in.s0 + in.s1);
+}
+
+/** This kernel performs reduction given an operation.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] partial_sum_ptr                            The buffer to hold the partial sums. Supported data types: same as @p src_ptr
+ * @param[in] partial_sum_stride_x                       Stride of the partial sum tensor in X dimension (in bytes)
+ * @param[in] partial_sum_step_x                         partial_sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] partial_sum_offset_first_element_in_bytes  The offset of the first element in the partial sum tensor
+ * @param[in] local_sums                                 Local buffer for storing the partial sums
+ */
+__kernel void reduction_operation(
+ VECTOR_DECLARATION(src),
+ VECTOR_DECLARATION(partial_sum),
+ __local DATA_TYPE *local_sums)
+{
+ Vector src = CONVERT_TO_VECTOR_STRUCT(src);
+ Vector partial_sum = CONVERT_TO_VECTOR_STRUCT(partial_sum);
+
+ unsigned int lsize = get_local_size(0);
+ unsigned int lid = get_local_id(0);
+
+ local_sums[lid] = OPERATION((__global DATA_TYPE *)src.ptr);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Perform parallel reduction
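+    // e.g. with a work-group size of 8 (an assumed value) the loop strides are
+    // 4, 2, 1: lanes 0-3 add lanes 4-7, lanes 0-1 add lanes 2-3, and finally
+    // lane 0 adds lane 1, leaving the group's partial result in local_sums[0].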
+ for(unsigned int i = lsize >> 1; i > 0; i >>= 1)
+ {
+ if(lid < i)
+ {
+ local_sums[lid] += local_sums[lid + i];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ if(lid == 0)
+ {
+ ((__global DATA_TYPE *)partial_sum.ptr + get_group_id(0))[0] = local_sums[0];
+ }
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/reshape_layer.cl b/src/core/CL/cl_kernels/reshape_layer.cl
new file mode 100644
index 0000000..23eccbf
--- /dev/null
+++ b/src/core/CL/cl_kernels/reshape_layer.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform tensor reshape
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: U8/S8/QS8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] input_shape Input spatial shape
+ * @param[in] output_shape Output spatial shape
+ */
+__kernel void reshape_layer(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ int2 input_shape,
+ int2 output_shape)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+ int3 id = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
+
+ // Linearize index
+ int linear_idx = id.x + id.y * input_shape.x + id.z * input_shape.x * input_shape.y;
+
+ // Translate to output
+ int3 out_id;
+ out_id.x = linear_idx % output_shape.x;
+ out_id.y = (linear_idx / output_shape.x) % output_shape.y;
+ out_id.z = linear_idx / (output_shape.x * output_shape.y);
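+    // e.g. (assumed shapes) with input_shape = (4, 2) and output_shape = (2, 4),
+    // the element at id = (3, 1, 0) has linear_idx = 3 + 1 * 4 = 7 and is written
+    // to out_id = (7 % 2, (7 / 2) % 4, 7 / 8) = (1, 3, 0).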
+
+ // Store result
+ *((__global DATA_TYPE *)tensor3D_offset(&out, out_id.x, out_id.y, out_id.z)) = *((__global DATA_TYPE *)in.ptr);
+}
diff --git a/src/core/CL/cl_kernels/roi_pooling_layer.cl b/src/core/CL/cl_kernels/roi_pooling_layer.cl
new file mode 100644
index 0000000..042b102
--- /dev/null
+++ b/src/core/CL/cl_kernels/roi_pooling_layer.cl
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if DATA_SIZE == 32
+#define VEC_SIZE 4
+#define VEC_MAX vec4_max
+#elif DATA_SIZE == 16
+#define VEC_SIZE 8
+#define VEC_MAX vec8_max
+#else /* DATA_SIZE is neither 32 nor 16 */
+#error "Unsupported data size"
+#endif /* DATA_SIZE == 32 */
+
+inline DATA_TYPE vec4_max(VEC_DATA_TYPE(DATA_TYPE, 4) vec)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ temp = fmax(vec.lo, vec.hi);
+ return fmax(temp.x, temp.y);
+}
+
+inline DATA_TYPE vec8_max(VEC_DATA_TYPE(DATA_TYPE, 8) vec)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ temp = fmax(vec.lo, vec.hi);
+ return vec4_max(temp);
+}
+
+/** Performs a roi pooling on a single output pixel.
+ *
+ * @param[in] input Pointer to input Tensor3D struct.
+ * @param[in] region_start_x Start x index projected onto the input tensor.
+ * @param[in] region_end_x End x index projected onto the input tensor.
+ * @param[in] region_start_y Start y index projected onto the input tensor.
+ * @param[in] region_end_y End y index projected onto the input tensor.
+ * @param[in] pz z index of the input tensor.
+ *
+ * @return A max pooled value from the region specified in the input tensor.
+ */
+inline DATA_TYPE roi_pool_1x1(const Tensor3D *input, int region_start_x, int region_end_x, int region_start_y, int region_end_y, int pz)
+{
+ // Iterate through the pooling region
+ if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ {
+ return (DATA_TYPE)0;
+ }
+ else
+ {
+ int num_iter = (int)((region_end_x - region_start_x) / VEC_SIZE);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ curr_max = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(-FLT_MAX);
+ for(int j = region_start_y; j < region_end_y; ++j)
+ {
+ int i = region_start_x;
+ for(; i < region_start_x + num_iter * VEC_SIZE; i += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(input, i, j, pz));
+ curr_max = fmax(val, curr_max);
+ }
+ for(; i < region_end_x; ++i)
+ {
+ DATA_TYPE val = *(__global DATA_TYPE *)tensor3D_offset(input, i, j, pz);
+ curr_max = fmax(curr_max, val);
+ }
+ }
+ return (DATA_TYPE)VEC_MAX(curr_max);
+ }
+}
+
+/** Performs a roi pooling function.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
+ * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
+ * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
+ * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
+ * @note Spatial scale must be passed using -DSPATIAL_SCALE;
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the pooled region of the source image as specified by ROI
+ * @param[in] rois_ptr Pointer to the rois array. Layout: {x, y, width, height, batch_indx}
+ * @param[in] rois_stride_x Stride of the rois array in X dimension (in bytes)
+ * @param[in] rois_step_x rois_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the rois array
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_stride_w Stride of the source image in W dimension (in bytes)
+ * @param[in] output_stride_w Stride of the destination image in W dimension (in bytes)
+ */
+__kernel void roi_pooling_layer(
+ TENSOR3D_DECLARATION(input),
+ VECTOR_DECLARATION(rois),
+ TENSOR3D_DECLARATION(output),
+ unsigned int input_stride_w, unsigned int output_stride_w)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ Vector rois = CONVERT_TO_VECTOR_STRUCT_NO_STEP(rois);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+ const int px = get_global_id(0);
+ const int py = get_global_id(1);
+ const int pw = get_global_id(2);
+
+ // Load roi parameters
+ // roi is laid out as follows:
+ // { x, y, width, height, batch_index }
+ const ushort4 roi = vload4(0, (__global ushort *)vector_offset(&rois, pw));
+ const ushort roi_batch = *((__global ushort *)vector_offset(&rois, pw) + 4);
+ const int2 roi_anchor = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE));
+ const int2 roi_dims = convert_int2_sat(fmax(round(convert_float2(roi.s23) * (float)SPATIAL_SCALE), 1.f));
+
+ // Calculate pooled region start and end
+ const float2 spatial_indx = (float2)(px, py);
+ const float2 pooled_dims = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+ const int2 max_spatial_dims = (int2)(MAX_DIM_X, MAX_DIM_Y);
+ int2 region_start = convert_int2_sat(floor(spatial_indx / pooled_dims * convert_float2(roi_dims))) + roi_anchor;
+ int2 region_end = convert_int2_sat(floor((spatial_indx + 1) / pooled_dims * convert_float2(roi_dims))) + roi_anchor;
+
+ region_start = clamp(region_start, 0, max_spatial_dims);
+ region_end = clamp(region_end, 0, max_spatial_dims);
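+    // e.g. (assumed values) with SPATIAL_SCALE = 0.0625 a ROI of
+    // {x = 32, y = 32, width = 64, height = 64} projects to roi_anchor = (2, 2) and
+    // roi_dims = (4, 4); with POOLED_DIM_X = POOLED_DIM_Y = 2 the output bin (0, 0)
+    // then covers input rows/columns 2 and 3.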
+
+ // Move input and output pointer across the fourth dimension
+ input.ptr += roi_batch * input_stride_w;
+ output.ptr += pw * output_stride_w;
+
+ for(int pz = 0; pz < MAX_DIM_Z; ++pz)
+ {
+        *(__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz) = (DATA_TYPE)roi_pool_1x1(&input,
+ region_start.x,
+ region_end.x,
+ region_start.y,
+ region_end.y, pz);
+ }
+}
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
index 9ef33b8..0106ce0 100644
--- a/src/core/CL/cl_kernels/scale.cl
+++ b/src/core/CL/cl_kernels/scale.cl
@@ -70,20 +70,20 @@
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
* @param[in] input_width Input image width
* @param[in] input_height Input image height
- * @param[in] output_width Output image width
- * @param[in] output_height Output image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
*/
__kernel void scale_nearest_neighbour(
IMAGE_DECLARATION(in),
IMAGE_DECLARATION(out),
const float input_width,
const float input_height,
- const float output_width,
- const float output_height)
+ const float scale_x,
+ const float scale_y)
{
Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(input_width / output_width, input_height / output_height);
+ const float2 r = (float2)(scale_x, scale_y);
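+    // scale_x/scale_y replace the input/output size ratio that was previously
+    // computed in the kernel, e.g. (assumed sizes) a 64x64 input scaled to a
+    // 128x128 output corresponds to scale_x = scale_y = 0.5f.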
const float8 tc = clamp_to_border(transform_nearest(get_current_coords(), r), input_width, input_height);
vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
}
@@ -104,20 +104,20 @@
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
* @param[in] input_width Input image width
* @param[in] input_height Input image height
- * @param[in] output_width Output image width
- * @param[in] output_height Output image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
*/
__kernel void scale_bilinear(
IMAGE_DECLARATION(in),
IMAGE_DECLARATION(out),
const float input_width,
const float input_height,
- const float output_width,
- const float output_height)
+ const float scale_x,
+ const float scale_y)
{
Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(input_width / output_width, input_height / output_height);
- const float8 tc = clamp_to_border(transform_bilinear(get_current_coords(), r), input_width, input_height);
+ const float2 r = (float2)(scale_x, scale_y);
+ const float8 tc = transform_bilinear(get_current_coords(), r);
vstore4(bilinear_interpolate(&in, tc, input_width, input_height), 0, (__global DATA_TYPE *)out.ptr);
}
diff --git a/src/core/CL/cl_kernels/scharr_filter.cl b/src/core/CL/cl_kernels/scharr_filter.cl
index ef9878c..d9b5d07 100644
--- a/src/core/CL/cl_kernels/scharr_filter.cl
+++ b/src/core/CL/cl_kernels/scharr_filter.cl
@@ -52,28 +52,28 @@
#ifdef GRAD_X
,
IMAGE_DECLARATION(dst_gx)
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
,
IMAGE_DECLARATION(dst_gy)
-#endif
+#endif /* GRAD_Y */
)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
#ifdef GRAD_X
Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif
+#endif /* GRAD_Y */
// Output pixels
#ifdef GRAD_X
short8 gx = (short8)0;
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
short8 gy = (short8)0;
-#endif
+#endif /* GRAD_Y */
// Row0
uchar16 temp = vload16(0, offset(&src, -1, -1));
@@ -83,12 +83,12 @@
#ifdef GRAD_X
gx += left * (short8)(-3);
gx += right * (short8)(+3);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
gy += left * (short8)(-3);
gy += middle * (short8)(-10);
gy += right * (short8)(-3);
-#endif
+#endif /* GRAD_Y */
// Row1
temp = vload16(0, offset(&src, -1, 0));
@@ -97,7 +97,7 @@
#ifdef GRAD_X
gx += left * (short8)(-10);
gx += right * (short8)(+10);
-#endif
+#endif /* GRAD_X */
// Row2
temp = vload16(0, offset(&src, -1, 1));
@@ -107,18 +107,18 @@
#ifdef GRAD_X
gx += left * (short8)(-3);
gx += right * (short8)(+3);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
gy += left * (short8)(+3);
gy += middle * (short8)(+10);
gy += right * (short8)(+3);
-#endif
+#endif /* GRAD_Y */
// Store results
#ifdef GRAD_X
vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif
+#endif /* GRAD_Y */
}
diff --git a/src/core/CL/cl_kernels/sobel_filter.cl b/src/core/CL/cl_kernels/sobel_filter.cl
index 4eb0eef..fc2b0ee 100644
--- a/src/core/CL/cl_kernels/sobel_filter.cl
+++ b/src/core/CL/cl_kernels/sobel_filter.cl
@@ -56,28 +56,28 @@
#ifdef GRAD_X
,
IMAGE_DECLARATION(dst_gx)
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
,
IMAGE_DECLARATION(dst_gy)
-#endif
+#endif /* GRAD_Y */
)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
#ifdef GRAD_X
Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif
+#endif /* GRAD_Y */
// Output pixels
#ifdef GRAD_X
short8 gx = (short8)0;
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
short8 gy = (short8)0;
-#endif
+#endif /* GRAD_Y */
// Row0
uchar16 temp = vload16(0, offset(&src, -1, -1));
@@ -87,12 +87,12 @@
#ifdef GRAD_X
gx += left * (short8)(-1);
gx += right * (short8)(+1);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
gy += left * (short8)(-1);
gy += middle * (short8)(-2);
gy += right * (short8)(-1);
-#endif
+#endif /* GRAD_Y */
// Row1
temp = vload16(0, offset(&src, -1, 0));
@@ -101,7 +101,7 @@
#ifdef GRAD_X
gx += left * (short8)(-2);
gx += right * (short8)(+2);
-#endif
+#endif /* GRAD_X */
// Row2
temp = vload16(0, offset(&src, -1, 1));
@@ -111,20 +111,20 @@
#ifdef GRAD_X
gx += left * (short8)(-1);
gx += right * (short8)(+1);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
gy += left * (short8)(+1);
gy += middle * (short8)(+2);
gy += right * (short8)(+1);
-#endif
+#endif /* GRAD_Y */
// Store results
#ifdef GRAD_X
vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif
+#endif /* GRAD_Y */
}
/**********************************************/
@@ -261,20 +261,20 @@
#ifdef GRAD_X
,
IMAGE_DECLARATION(dst_gx)
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
,
IMAGE_DECLARATION(dst_gy)
-#endif
+#endif /* GRAD_Y */
)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
#ifdef GRAD_X
Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif
+#endif /* GRAD_Y */
// Output pixels
short16 gx_gy = sobel1x5(&src,
@@ -284,10 +284,10 @@
// Store result in dst
#ifdef GRAD_X
vstore8(gx_gy.s01234567, 0, ((__global short *)dst_gx.ptr));
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
vstore8(gx_gy.s89ABCDEF, 0, ((__global short *)dst_gy.ptr));
-#endif
+#endif /* GRAD_Y */
}
/** Apply a 5x1 convolution matrix to two single channel S16 input temporary images
@@ -326,32 +326,32 @@
#ifdef GRAD_X
IMAGE_DECLARATION(src_x),
IMAGE_DECLARATION(dst_gx),
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
IMAGE_DECLARATION(src_y),
IMAGE_DECLARATION(dst_gy),
-#endif
+#endif /* GRAD_Y */
int dummy)
{
#ifdef GRAD_X
Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif
+#endif /* GRAD_Y */
#ifdef GRAD_X
short8 gx = sobel5x1(&src_x,
1, 4, 6, 4, 1);
vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
short8 gy = sobel5x1(&src_y,
-1, -2, 0, 2, 1);
vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif
+#endif /* GRAD_Y */
}
/**********************************************/
@@ -444,20 +444,20 @@
#ifdef GRAD_X
,
IMAGE_DECLARATION(dst_gx)
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
,
IMAGE_DECLARATION(dst_gy)
-#endif
+#endif /* GRAD_Y */
)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
#ifdef GRAD_X
Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif
+#endif /* GRAD_Y */
int8 gx = (int8)0;
int8 gy = (int8)0;
@@ -466,10 +466,10 @@
// Store result in dst
#ifdef GRAD_X
vstore8(gx, 0, ((__global int *)dst_gx.ptr));
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
vstore8(gy, 0, ((__global int *)dst_gy.ptr));
-#endif
+#endif /* GRAD_Y */
}
/** Apply a 7x1 convolution matrix to two single channel S16 input temporary images and output two single channel S16 images and leave the borders undefined.
@@ -507,33 +507,33 @@
#ifdef GRAD_X
IMAGE_DECLARATION(src_x),
IMAGE_DECLARATION(dst_gx),
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
IMAGE_DECLARATION(src_y),
IMAGE_DECLARATION(dst_gy),
-#endif
+#endif /* GRAD_Y */
int dummy)
{
#ifdef GRAD_X
Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif
+#endif /* GRAD_Y */
// Output pixels
#ifdef GRAD_X
int8 gx = 0;
SOBEL7x1(&src_x, gx, Y);
vstore8(gx, 0, (__global int *)dst_gx.ptr);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
int8 gy = 0;
SOBEL7x1(&src_y, gy, X);
vstore8(gy, 0, (__global int *)dst_gy.ptr);
-#endif
+#endif /* GRAD_Y */
}
/**********************************************/
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 632b4a5..9b24380 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -23,15 +23,37 @@
*/
#include "helpers.h"
-#if defined USE_F16
-#define MINVAL HALF_MIN
+#ifdef FIXED_POINT_POSITION
+
+#include "fixed_point.h"
+#define MAX_OP(x, y, type, size) MAX_OP_EXPAND(x, y, type, size)
+#define ADD_OP(x, y, type, size) ADD_SAT_OP_EXPAND((x), (y), type, size)
+#define SUB_OP(x, y, type, size) SUB_SAT_OP_EXPAND((x), (y), type, size)
+#define DIV_OP(x, y, type, size) DIV_SAT_OP_VEC_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
+#define EXP_OP(x, type, size) EXP_OP_EXPAND((x), type, size, FIXED_POINT_POSITION)
+
+#define MIN_VAL_EXPAND(type) type##_MIN
+#define MIN_VAL(type) MIN_VAL_EXPAND(type)
+#define MINVAL MIN_VAL(DATA_TYPE)
+#define SELECT_DATA_TYPE EXPAND(DATA_TYPE)
+
+#else /* FIXED_POINT_POSITION */
+
+#define MAX_OP(x, y, type, size) max((x), (y))
+#define ADD_OP(x, y, type, size) ((x) + (y))
+#define SUB_OP(x, y, type, size) ((x) - (y))
+#define DIV_OP(x, y, type, size) ((x) / (y))
+#define EXP_OP(x, type, size) exp((x))
+
+#ifdef USE_F16
+#define MINVAL -HALF_MAX
#define SELECT_DATA_TYPE short
-#define DATA_TYPE half
-#else
-#define MINVAL FLT_MIN
+#else /* USE_F16 */
+#define MINVAL -FLT_MAX
#define SELECT_DATA_TYPE int
-#define DATA_TYPE float
-#endif
+#endif /* USE_F16 */
+
+#endif /* FIXED_POINT_POSITION */
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@@ -39,30 +61,34 @@
/** Identifies the maximum value across the 1st dimension.
*
* @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note In case F16 is used -DUSE_HALF must be passed otherwise the kernel will default to used F32.
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
* @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] width Input image width
*/
__kernel void softmax_layer_max(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst),
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
uint width)
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
// Initialize local maximum
VEC_DATA_TYPE(DATA_TYPE, 16)
@@ -74,23 +100,23 @@
{
VEC_DATA_TYPE(DATA_TYPE, 16)
data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
- max_val = max(data, max_val);
+ max_val = MAX_OP(data, max_val, DATA_TYPE, 16);
}
-#if defined NON_MULTIPLE_OF_16
+#ifdef NON_MULTIPLE_OF_16
// Handle non multiple of 16
VEC_DATA_TYPE(DATA_TYPE, 16)
data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
- max_val = max(max_val, select(type_min, data, widx));
-#endif
+ max_val = MAX_OP(max_val, select(type_min, data, widx), DATA_TYPE, 16);
+#endif /* NON_MULTIPLE_OF_16 */
// Perform max reduction
- max_val.s01234567 = max(max_val.s01234567, max_val.s89ABCDEF);
- max_val.s0123 = max(max_val.s0123, max_val.s4567);
- max_val.s01 = max(max_val.s01, max_val.s23);
- max_val.s0 = max(max_val.s0, max_val.s1);
+ max_val.s01234567 = MAX_OP(max_val.s01234567, max_val.s89ABCDEF, DATA_TYPE, 8);
+ max_val.s0123 = MAX_OP(max_val.s0123, max_val.s4567, DATA_TYPE, 4);
+ max_val.s01 = MAX_OP(max_val.s01, max_val.s23, DATA_TYPE, 2);
+ max_val.s0 = MAX_OP(max_val.s0, max_val.s1, DATA_TYPE, 1);
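+    // The reduction halves the vector at every step (16 -> 8 -> 4 -> 2 -> 1),
+    // so max_val.s0 now holds the maximum of all 16 lanes.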
// Store result
*((__global DATA_TYPE *)dst.ptr) = max_val.s0;
@@ -100,46 +126,54 @@
* then gets the exponent of each element as sums all elements across each row.
*
* @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note In case F16 is used -DUSE_HALF must be passed otherwise the kernel will default to used F32.
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
* @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: F16, F32
+ * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
* @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
* @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
* @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: F16, F32
+ * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
* @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
* @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
* @param[in] width Input image width
*/
__kernel void softmax_layer_shift_exp_sum(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(max),
- IMAGE_DECLARATION(dst),
- IMAGE_DECLARATION(sum),
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(max),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(sum),
uint width)
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
- Image max = CONVERT_TO_IMAGE_STRUCT(max);
- Image sum = CONVERT_TO_IMAGE_STRUCT(sum);
+ Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
+ Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
// Load max value of 1D logits vector (row)
DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&max, 0, 0));
@@ -154,28 +188,30 @@
{
VEC_DATA_TYPE(DATA_TYPE, 16)
data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
- data = exp(data - max_val);
+ data = SUB_OP(data, max_val, DATA_TYPE, 16);
+ data = EXP_OP(data, DATA_TYPE, 16);
vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, i << 4, 0));
- sum1D += data;
+ sum1D = ADD_OP(sum1D, data, DATA_TYPE, 16);
}
-#if defined NON_MULTIPLE_OF_16
+#ifdef NON_MULTIPLE_OF_16
// Handle non multiple of 16
VEC_DATA_TYPE(DATA_TYPE, 16)
data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
- data = exp(data - max_val);
+ data = SUB_OP(data, max_val, DATA_TYPE, 16);
+ data = EXP_OP(data, DATA_TYPE, 16);
VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
data = select(0, data, widx);
vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, width4 << 4, 0));
- sum1D += data;
-#endif
+ sum1D = ADD_OP(sum1D, data, DATA_TYPE, 16);
+#endif /* NON_MULTIPLE_OF_16 */
 // Perform sum reduction
- sum1D.s01234567 = sum1D.s01234567 + sum1D.s89ABCDEF;
- sum1D.s0123 = sum1D.s0123 + sum1D.s4567;
- sum1D.s01 = sum1D.s01 + sum1D.s23;
- sum1D.s0 = sum1D.s0 + sum1D.s1;
+ sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
+ sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
+ sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
+ sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
// Calculate and store result
*((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
@@ -184,38 +220,45 @@
/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
*
* @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: F16, F32
+ * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
* @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
* @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
* @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void softmax_layer_norm(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(sum),
- IMAGE_DECLARATION(dst))
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(sum),
+ TENSOR3D_DECLARATION(dst))
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
- Image sum = CONVERT_TO_IMAGE_STRUCT_NO_STEP(sum);
+ Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
// Load max value of 1D logits vector (row)
DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
VEC_DATA_TYPE(DATA_TYPE, 16)
data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
- vstore16(data / sum_val, 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
+ vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
}
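
The three kernels above (softmax_layer_max, softmax_layer_shift_exp_sum and softmax_layer_norm) together implement a numerically stable softmax along each row. As a reference for what the vectorised code computes, here is a minimal scalar C++ sketch of the same three stages, assuming the plain float path (no fixed point, no vector masking); it is an illustration, not part of the patch:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference for the three-stage softmax split used above:
// 1) row maximum, 2) exp(x - max) plus running sum, 3) divide by the sum.
// src must be non-empty.
std::vector<float> softmax_row(const std::vector<float> &src)
{
    // Stage 1: softmax_layer_max
    const float max_val = *std::max_element(src.begin(), src.end());

    // Stage 2: softmax_layer_shift_exp_sum
    std::vector<float> dst(src.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < src.size(); ++i)
    {
        dst[i] = std::exp(src[i] - max_val);
        sum += dst[i];
    }

    // Stage 3: softmax_layer_norm
    for(float &v : dst)
    {
        v /= sum;
    }
    return dst;
}
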
diff --git a/src/core/CL/cl_kernels/transpose.cl b/src/core/CL/cl_kernels/transpose.cl
index c30158f..c993005 100644
--- a/src/core/CL/cl_kernels/transpose.cl
+++ b/src/core/CL/cl_kernels/transpose.cl
@@ -98,7 +98,10 @@
#ifndef DATA_TYPE_IN_BYTES
#error DATA_TYPE_IN_BYTES not set for the transpose OpenCL kernel
-#endif
+#endif /* not DATA_TYPE_IN_BYTES */
+
+#undef VLOAD
+#undef VSTORE
#if DATA_TYPE_IN_BYTES == 4
#define DATA_TYPE uint
@@ -118,9 +121,9 @@
#define VLOAD(x, y) vload16(x, y)
#define VSTORE(x, y, z) vstore16(x, y, z)
#define BLOCK_SIZE 16
-#else
+#else /* switch DATA_TYPE_IN_BYTES */
#error DATA_TYPE_IN_BYTES not supported for transpose
-#endif
+#endif /* switch DATA_TYPE_IN_BYTES */
/** This OpenCL kernel computes the matrix transposition of input matrix
*
diff --git a/src/core/CL/cl_kernels/warp_perspective.cl b/src/core/CL/cl_kernels/warp_perspective.cl
index 863b6c9..d955e42 100644
--- a/src/core/CL/cl_kernels/warp_perspective.cl
+++ b/src/core/CL/cl_kernels/warp_perspective.cl
@@ -92,7 +92,7 @@
{
Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
Image out = CONVERT_TO_IMAGE_STRUCT(out);
- vstore4(read_texels4(&in, convert_int8(clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height))), 0, out.ptr);
+ vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height))), 0, out.ptr);
}
/** Performs a perspective transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
@@ -124,5 +124,5 @@
{
Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
Image out = CONVERT_TO_IMAGE_STRUCT(out);
- vstore4(bilinear_interpolate(&in, clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height), width, height), 0, out.ptr);
+ vstore4(bilinear_interpolate(&in, apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height), 0, out.ptr);
}
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 83bbe6a..18202c1 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/TensorInfo.h"
@@ -33,32 +34,98 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+
using namespace arm_compute;
-void CLActivationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+CLActivationLayerKernel::CLActivationLayerKernel()
+ : _input(nullptr), _output(nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+}
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ if(output != nullptr)
+ {
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const int fixed_point_position = input->info()->fixed_point_position();
+ float a_const = act_info.a();
+ float b_const = act_info.b();
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ a_const = static_cast<int>(lround(a_const * (1 << fixed_point_position)));
+ b_const = static_cast<int>(lround(b_const * (1 << fixed_point_position)));
+ }
// Set build options
std::set<std::string> build_opts;
- build_opts.insert(("-D" + string_from_activation_func(act_info.activation())));
- build_opts.insert(("-D" + ((is_data_type_float(input->info()->data_type())) ? std::string("TYPE_FP") : std::string("TYPE_INT"))));
- build_opts.insert(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.insert(("-DA=" + val_to_string(act_info.a())));
- build_opts.insert(("-DB=" + val_to_string(act_info.b())));
+ build_opts.emplace(("-DACT=" + lower_string(string_from_activation_func(act_info.activation()))));
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const)));
+ build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const)));
+ build_opts.emplace(output == nullptr ? "-DIN_PLACE" : "");
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fixed_point_position)));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer", build_opts));
// Make sure _kernel is initialized before calling the parent's configure
- constexpr unsigned int num_elems_processed_per_iteration = 16;
- ICLSimple3DKernel::configure(input, output, num_elems_processed_per_iteration);
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+ }
+ else
+ {
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+ }
+
+ ICLKernel::configure(win);
+}
+
+void CLActivationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ if(_output != nullptr)
+ {
+ add_3D_tensor_argument(idx, _output, slice);
+ }
+ enqueue(queue, *this, slice);
+ }
+ while(collapsed.slide_window_slice_3D(slice));
}
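
The fixed-point branch in configure() above rescales the activation constants a and b into the QSn domain before passing them as -DA_VAL / -DB_VAL. A small stand-alone sketch of that conversion (the round-trip back to float is only for illustration and is not part of the kernel):

#include <cmath>

// Float -> QSn integer, as done for a_const/b_const above:
// multiply by 2^fixed_point_position and round to the nearest integer.
int to_fixed_point(float value, int fixed_point_position)
{
    return static_cast<int>(std::lround(value * (1 << fixed_point_position)));
}

// QSn integer -> float, shown only to make the scaling explicit.
float to_float(int fixed, int fixed_point_position)
{
    return static_cast<float>(fixed) / static_cast<float>(1 << fixed_point_position);
}

// Example: with FIXED_POINT_POSITION=4 a bound of a = 6.0f is encoded as
// 6.0 * 16 = 96 and decodes back to 96 / 16 = 6.0f.
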
diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
index aaa62d0..65422c2 100644
--- a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
@@ -48,9 +48,37 @@
void CLArithmeticAdditionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+ else if(input1->info()->data_type() == DataType::F16 && input2->info()->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+ if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
+ {
+ // Check that all data types are the same and all fixed-point positions are the same
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
+ }
_input1 = input1;
_input2 = input2;
@@ -58,18 +86,16 @@
const bool has_float_out = is_data_type_float(output->info()->data_type());
- // Check for invalid combination
- if(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8))
- {
- ARM_COMPUTE_ERROR("You called with the wrong data types.");
- }
-
// Set kernel build options
std::set<std::string> build_opts;
build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ if(is_data_type_fixed_point(input1->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input1->info()->fixed_point_position()));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_add", build_opts));
diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
index 4c84727..c5183af 100644
--- a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
@@ -45,17 +45,32 @@
void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- // Check for invalid combination
- if(output->info()->data_type() == DataType::U8)
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ // Auto initialize output if not initialized
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
}
- else
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+ if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ // Check that all data types are the same and all fixed-point positions are the same
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
}
_input1 = input1;
@@ -70,6 +85,10 @@
build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ if(is_data_type_fixed_point(input1->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input1->info()->fixed_point_position()));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_sub", build_opts));
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 309a153..18c0c97 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -26,12 +26,15 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
using namespace arm_compute;
CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
@@ -39,24 +42,10 @@
{
}
-void CLBatchNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
+void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
float epsilon)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
-
- // Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32);
_input = input;
_output = output;
@@ -66,25 +55,56 @@
_gamma = gamma;
_epsilon = epsilon;
+ if(output != nullptr)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, mean, var, beta, gamma);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, mean, var, beta, gamma);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var, beta, gamma);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var, beta, gamma);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ build_opts.emplace(output == nullptr ? "-DIN_PLACE" : "");
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
+
// Create kernel
- std::string kernel_name = "batchnormalization_layer";
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts));
// Set kernel static arguments
unsigned int idx = 2 * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
_kernel.setArg<cl_float>(idx++, _epsilon);
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration = 4;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->info()->valid_region());
-
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+ }
+ else
+ {
+ update_window_and_padding(win, input_access);
+ }
ICLKernel::configure(win);
}
@@ -108,7 +128,10 @@
{
idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
+ if(_output != nullptr)
+ {
+ add_3D_tensor_argument(idx, _output, slice);
+ }
enqueue(queue, *this, slice);
}
while(window.slide_window_slice_3D(slice));
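
For reference, the per-element operation this kernel launches is the usual batch-normalization formula, with mean, var, beta and gamma indexed by the element's channel. A scalar C++ sketch (illustration only; the CL kernel vectorises it and also covers the QS8/QS16 path):

#include <cmath>

// out = gamma * (x - mean) / sqrt(var + epsilon) + beta
float batch_normalize(float x, float mean, float var, float beta, float gamma, float epsilon)
{
    const float inv_std = 1.0f / std::sqrt(var + epsilon);
    return gamma * (x - mean) * inv_std + beta;
}
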
diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp
index e113d30..0299f62 100644
--- a/src/core/CL/kernels/CLBox3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLBox3x3Kernel.cpp
@@ -37,7 +37,7 @@
BorderSize CLBox3x3Kernel::border_size() const
{
- return 1;
+ return BorderSize(1);
}
void CLBox3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp
index 5411533..be046cf 100644
--- a/src/core/CL/kernels/CLChannelExtractKernel.cpp
+++ b/src/core/CL/kernels/CLChannelExtractKernel.cpp
@@ -71,12 +71,12 @@
// Configure window
Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration);
- AccessWindowRectangle output_access(input->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _subsampling, 1.f / _subsampling);
+ AccessWindowRectangle output_access(output->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _subsampling, 1.f / _subsampling);
update_window_and_padding(win, input_access, output_access);
ValidRegion input_valid_region = input->info()->valid_region();
- output_access.set_valid_region(win, ValidRegion(std::move(input_valid_region.anchor), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(input_valid_region.anchor, output->info()->tensor_shape()));
ICLKernel::configure(win);
}
@@ -115,11 +115,10 @@
// Configure window
Window win = calculate_max_window(*input_plane->info(), Steps(_num_elems_processed_per_iteration));
- AccessWindowHorizontal output_access(input_plane->info(), 0, _num_elems_processed_per_iteration);
+ AccessWindowHorizontal input_access(input_plane->info(), 0, _num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
- update_window_and_padding(win,
- AccessWindowHorizontal(input_plane->info(), 0, _num_elems_processed_per_iteration),
- output_access);
+ update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, input_plane->info()->valid_region());
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index ad66c39..c7884e3 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -43,9 +43,20 @@
void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ TensorShape output_shape = input->info()->tensor_shape();
+ output_shape.set(0, convolved_dims.first);
+ output_shape.set(1, convolved_dims.second);
+ output_shape.set(2, input->info()->tensor_shape()[0]);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
@@ -53,16 +64,22 @@
// Create kernel
std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts));
+ build_opts.emplace("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.first));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
- // Set static kernel arguments
- unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
- _kernel.setArg<cl_uint>(idx++, _convolved_dims.first);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts));
// Configure window
Window win = calculate_max_window(*input->info(), Steps());
+
// The CLCol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
ICLKernel::configure(win);
}
@@ -70,16 +87,23 @@
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+ // The collapse method relies on the assumption that the third dimension of the input buffer is 1
+ ARM_COMPUTE_ERROR_ON(window.z().end() != 1);
- Window slice_in = window.first_slice_window_2D();
- Window slice_out = window.first_slice_window_3D();
+ Window collapsed_window = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed_window.first_slice_window_3D();
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_3D_tensor();
+ _kernel.setArg<cl_uint>(idx++, _output->info()->strides_in_bytes()[3]);
+
do
{
// Set inputs
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_in);
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice_in) && window.slide_window_slice_3D(slice_out));
+ while(collapsed_window.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
index bdfe398..fd64dc4 100644
--- a/src/core/CL/kernels/CLConvolutionKernel.cpp
+++ b/src/core/CL/kernels/CLConvolutionKernel.cpp
@@ -79,7 +79,7 @@
options.insert(mat_str.str());
}
- options.insert("-DSCALE=" + val_to_string(scale));
+ options.insert("-DSCALE=" + support::cpp11::to_string(scale));
DataType data_type = data_type_for_convolution_matrix(conv, matrix_size * matrix_size);
options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -143,7 +143,7 @@
for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
{
- build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+ build_opts.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
}
build_opts.insert("-DSCALE=0");
@@ -151,7 +151,7 @@
build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable1x" + val_to_string(matrix_size) + "_static", build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable1x" + support::cpp11::to_string(matrix_size) + "_static", build_opts));
// Configure kernel window
constexpr unsigned int num_elems_processed_per_iteration = 8;
@@ -195,10 +195,10 @@
for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
{
- build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+ build_opts.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
}
- build_opts.insert("-DSCALE=" + val_to_string(scale));
+ build_opts.insert("-DSCALE=" + support::cpp11::to_string(scale));
build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
@@ -209,7 +209,7 @@
build_opts.insert(out_type.str());
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable" + val_to_string(matrix_size) + "x1_static", build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable" + support::cpp11::to_string(matrix_size) + "x1_static", build_opts));
// Configure kernel window
constexpr unsigned int num_elems_processed_per_iteration = 8;
@@ -270,16 +270,16 @@
for(unsigned int j = 0; j < MAX_MATRIX_SIZE; j++)
{
- options.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+ options.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
}
- options.insert("-DSCALE=" + val_to_string(scale));
+ options.insert("-DSCALE=" + support::cpp11::to_string(scale));
DataType data_type = data_type_for_convolution_matrix(conv, matrix_size);
options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- options.insert("-DMATRIX_WIDTH=" + val_to_string(width));
- options.insert("-DMATRIX_HEIGHT=" + val_to_string(height));
+ options.insert("-DMATRIX_WIDTH=" + support::cpp11::to_string(width));
+ options.insert("-DMATRIX_HEIGHT=" + support::cpp11::to_string(height));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_rectangle", options));
diff --git a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
index 73f1ba1..edfbf82 100644
--- a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
@@ -35,10 +35,14 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <map>
+
using namespace arm_compute;
CLDepthConcatenateKernel::CLDepthConcatenateKernel()
- : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0)
+ : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
{
}
@@ -49,49 +53,58 @@
void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ static std::map<int, std::pair<std::string, int>> configs_map =
+ {
+ { 1, { "uchar", 16 } },
+ { 2, { "ushort", 8 } },
+ { 4, { "uint", 4 } },
+ { 8, { "ulong", 2 } },
+ };
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+ ARM_COMPUTE_ERROR_ON(configs_map.find(input->info()->element_size()) == configs_map.end());
// The gaps between the two lowest dimensions of input and output need to be divisible by 2
// Otherwise it is not clear how the padding should be added onto the input tensor
ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
- _input = input;
- _output = output;
+ _input = input;
+ _output = output;
+ _depth_offset = depth_offset;
+
+ // Add build options
+ auto config = configs_map.find(static_cast<int>(input->info()->element_size()));
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + config->second.first));
+ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(config->second.second)));
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth"));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth", build_opts));
// Configure kernel window
_left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
_top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
- const unsigned int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2] + _left_right * output->info()->strides_in_bytes()[0] + _top_bottom *
- output->info()->strides_in_bytes()[1];
-
- const unsigned int num_elems_processed_per_iteration = 4;
- const unsigned int num_elems_read_per_iteration = 4;
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const unsigned int num_elems_read_per_iteration = 16 / input->info()->element_size();
const unsigned int num_rows_read_per_iteration = 1;
// The window needs to be based on input as we copy all the depths of input
- Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size());
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
+ AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- AccessWindowRectangle(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration),
- output_access);
-
+ update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
- unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
- _kernel.setArg<unsigned int>(idx, offset_to_first_elements_in_bytes);
-
ICLKernel::configure(win);
}
@@ -100,14 +113,27 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window slice = window.first_slice_window_3D();
+
+ const int offset_to_first_elements_in_bytes = _depth_offset * _output->info()->strides_in_bytes()[2];
+
+ unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ const cl_int3 offsets =
+ {
+ {
+ static_cast<cl_int>(_left_right),
+ static_cast<cl_int>(_top_bottom),
+ static_cast<cl_int>(offset_to_first_elements_in_bytes),
+ }
+ };
+ _kernel.setArg<cl_int3>(idx, offsets);
do
{
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _output, slice);
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice));
+ while(window.slide_window_slice_3D(slice));
}
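
The configs_map introduced above picks an untyped OpenCL vector type plus a vector width so that every work item always moves 16 bytes, whatever the element size. A small sketch of that relation (function name is mine, illustration only):

#include <string>
#include <utility>

// Each work item of concatenate_depth moves 16 bytes, so the vector width
// is 16 / element_size and the element is reinterpreted as an unsigned
// integer of the same width. Mirrors the configs_map above.
std::pair<std::string, int> concat_vec_config(int element_size_in_bytes)
{
    switch(element_size_in_bytes)
    {
        case 1: return { "uchar", 16 };  // 16 x 1 byte  = 16 bytes
        case 2: return { "ushort", 8 };  //  8 x 2 bytes = 16 bytes
        case 4: return { "uint", 4 };    //  4 x 4 bytes = 16 bytes
        case 8: return { "ulong", 2 };   //  2 x 8 bytes = 16 bytes
        default: return { "", 0 };       // unsupported; caught by the error check in configure()
    }
}
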
diff --git a/src/core/CL/kernels/CLDepthConvertKernel.cpp b/src/core/CL/kernels/CLDepthConvertKernel.cpp
index 24608bd..c43884a 100644
--- a/src/core/CL/kernels/CLDepthConvertKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConvertKernel.cpp
@@ -40,13 +40,21 @@
void CLDepthConvertKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16,
+ DataType::U16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16,
+ DataType::U16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_ERROR_ON(input == output);
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data types must be different");
ARM_COMPUTE_ERROR_ON(shift >= 8);
 // Check if conversion is supported
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && output->info()->data_type() != DataType::F32,
+ "Only data types supported [in] QS8 -> [out] F32");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::F32),
+ "Only data types supported [in] QS16 -> [out] F32");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && ((output->info()->data_type() != DataType::QS8) && output->info()->data_type() != DataType::QS16),
+ "Only data types supported [in] F32 -> [out] QS8, QS16");
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S16
&& output->info()->data_type() != DataType::U32 && output->info()->data_type() != DataType::S32),
"Only data types supported [in] U8 -> [out] U16, S16, U32, S32");
@@ -67,6 +75,11 @@
&& output->info()->data_type() != DataType::S16),
"Only data types supported [in] S32 -> [out] U8, U16, S16");
+ // Auto initialize output shape if not initialized (we can only auto-configure the shape, the data type must be given)
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
// Get data sizes
const size_t input_size = data_size_from_type(input->info()->data_type());
const size_t output_size = data_size_from_type(output->info()->data_type());
@@ -83,8 +96,12 @@
{
kernel_name += "_up";
}
- build_opts.insert("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ if(is_data_type_fixed_point(input->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
diff --git a/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp
new file mode 100644
index 0000000..6e56835
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+
+using namespace arm_compute;
+
+CLDepthwiseConvolution3x3Kernel::CLDepthwiseConvolution3x3Kernel()
+ : _border_size(0), _input(), _output(), _weights(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_x(0), _conv_pad_y(0)
+{
+}
+
+BorderSize CLDepthwiseConvolution3x3Kernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLDepthwiseConvolution3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
+
+ std::pair<unsigned int, unsigned int> expected_output = scaled_dimensions(input->info()->tensor_shape().x(), input->info()->tensor_shape().y(),
+ weights->info()->tensor_shape().x(), weights->info()->tensor_shape().y(),
+ conv_info);
+
+ ARM_COMPUTE_UNUSED(expected_output);
+ ARM_COMPUTE_ERROR_ON(expected_output.first != output->info()->tensor_shape().x());
+ ARM_COMPUTE_ERROR_ON(expected_output.second != output->info()->tensor_shape().y());
+
+ _input = input;
+ _output = output;
+ _weights = weights;
+ _conv_stride_x = conv_info.stride().first;
+ _conv_stride_y = conv_info.stride().second;
+ _conv_pad_x = conv_info.pad().first;
+ _conv_pad_y = conv_info.pad().second;
+ _border_size = BorderSize(_conv_pad_y, _conv_pad_x);
+
+ // Set build options
+ ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3);
+ std::set<std::string> options{ "-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x) };
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_convolution_3x3", options));
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = 2;
+ const unsigned int num_elems_written_per_iteration = 2;
+ const unsigned int num_elems_read_per_iteration = 3 + _conv_stride_x;
+ const unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration, _conv_stride_x, _conv_stride_y);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+ AccessWindowStatic weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
+
+ update_window_and_padding(win, input_access, weights_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLDepthwiseConvolution3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_3D();
+ Window slice_out = window.first_slice_window_3D();
+ Window slice_weights = window.first_slice_window_3D();
+
+ slice_in.adjust(Window::DimX, -_conv_pad_x, true);
+ slice_in.adjust(Window::DimY, -_conv_pad_y, true);
+ slice_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
+ slice_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
+ slice_weights.set_dimension_step(Window::DimX, 0);
+ slice_weights.set_dimension_step(Window::DimY, 0);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ add_3D_tensor_argument(idx, _weights, slice_weights);
+
+ enqueue(queue, *this, slice_out);
+ }
+ while(window.slide_window_slice_3D(slice_out));
+}
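
configure() above validates the output against scaled_dimensions(). Assuming the usual floor-based convolution arithmetic (the exact rounding inside scaled_dimensions may differ), the expected spatial size works out as in this sketch:

#include <utility>

// Expected spatial output size of a convolution with explicit padding and
// stride, assuming floor rounding.
std::pair<unsigned int, unsigned int> expected_conv_output(unsigned int in_w, unsigned int in_h,
                                                           unsigned int k_w, unsigned int k_h,
                                                           unsigned int pad_x, unsigned int pad_y,
                                                           unsigned int stride_x, unsigned int stride_y)
{
    const unsigned int out_w = (in_w + 2 * pad_x - k_w) / stride_x + 1;
    const unsigned int out_h = (in_h + 2 * pad_y - k_h) / stride_y + 1;
    return { out_w, out_h };
}

// Example: a 224x224 input with a 3x3 kernel, pad 1, stride 1 gives 224x224;
// the same input with stride 2 gives 112x112.
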
diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
new file mode 100644
index 0000000..0eaadb8
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+#include <tuple>
+
+using namespace arm_compute;
+
+CLDepthwiseIm2ColKernel::CLDepthwiseIm2ColKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height));
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
+ build_opts.emplace("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
+ build_opts.emplace("-DPAD_X=" + support::cpp11::to_string(conv_info.pad().first));
+ build_opts.emplace("-DPAD_Y=" + support::cpp11::to_string(conv_info.pad().second));
+ build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.emplace("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
+ build_opts.emplace("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_im2col", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ // The CLDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLDepthwiseIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_in = window.first_slice_window_3D();
+
+ // Setup slice
+ slice.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _output->info()->dimension(0)));
+ slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
+ slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), 1));
+
+ // Setup input slice
+ // The first three dimensions of the input are increased by the inner loops
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
+}
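A minimal stand-alone sketch of driving the new im2col kernel follows; the tensor shapes, the CLScheduler set-up and the 3x3/stride-1 configuration are illustrative assumptions rather than part of this patch.

// Hedged sketch: exercise CLDepthwiseIm2ColKernel directly (shapes are made up for illustration).
#include "arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void run_depthwise_im2col_example()
{
    CLScheduler::get().default_init();

    const Size2D        kernel_dims(3, 3);
    const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1

    CLTensor input, output;
    // Input: 16x16 image with 8 channels.
    input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    // Output: kernel_w * kernel_h elements per convolution position, per channel (9 = 3 * 3).
    output.allocator()->init(TensorInfo(TensorShape(9U, 256U, 8U), 1, DataType::F32));

    CLDepthwiseIm2ColKernel kernel;
    kernel.configure(&input, &output, kernel_dims, conv_info);

    input.allocator()->allocate();
    output.allocator()->allocate();

    CLScheduler::get().enqueue(kernel);
    CLScheduler::get().sync();
}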
diff --git a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
new file mode 100644
index 0000000..2086b1d
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDepthwiseVectorToTensorKernel::CLDepthwiseVectorToTensorKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLDepthwiseVectorToTensorKernel::configure(const ICLTensor *input, ICLTensor *output, size_t conv_w, size_t conv_h)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DCONV_WIDTH=" + support::cpp11::to_string(conv_w));
+ build_opts.emplace("-DCONV_HEIGHT=" + support::cpp11::to_string(conv_h));
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_vector_to_tensor", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+    // The CLDepthwiseVectorToTensorKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLDepthwiseVectorToTensorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_1D();
+ Window slice_out = window.first_slice_window_3D();
+
+ // Setup slice
+ slice.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), 1));
+
+ // Setup output slice
+ // The first three dimensions of the output are increased by the inner loops
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_1D(slice) && window.slide_window_slice_3D(slice_out));
+}
diff --git a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
new file mode 100644
index 0000000..68de68b
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDepthwiseWeightsReshapeKernel::CLDepthwiseWeightsReshapeKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLDepthwiseWeightsReshapeKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * input->info()->dimension(1));
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_weights_reshape", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ // The CLDepthwiseWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLDepthwiseWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_out = window.first_slice_window_2D();
+
+ // Setup slice
+ slice.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ slice.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1));
+ slice.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), 1));
+
+ // Setup output slice
+ // The first two dimensions of the output are increased by the inner loops
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice) && window.slide_window_slice_2D(slice_out));
+}
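The asserts in the three depthwise kernels above fix the intermediate shapes the depthwise GEMV path relies on; the helper below is an illustration of that implied shape arithmetic (the floor-rounded output size is an assumption), not code from this patch.

// Illustrative shape arithmetic implied by the asserts in the three depthwise kernels above.
#include <cstddef>

struct DepthwiseShapes
{
    // im2col output: [kernel_w * kernel_h, conv_w * conv_h, channels]
    std::size_t im2col_w, im2col_h, im2col_d;
    // reshaped weights: [kernel_w * kernel_h, channels]
    std::size_t weights_w, weights_h;
};

DepthwiseShapes depthwise_shapes(std::size_t in_w, std::size_t in_h, std::size_t channels,
                                 std::size_t kernel_w, std::size_t kernel_h,
                                 std::size_t stride_x, std::size_t stride_y,
                                 std::size_t pad_x, std::size_t pad_y)
{
    // Assumed floor-rounded convolved size, in the style of scaled_dimensions().
    const std::size_t conv_w = (in_w + 2 * pad_x - kernel_w) / stride_x + 1;
    const std::size_t conv_h = (in_h + 2 * pad_y - kernel_h) / stride_y + 1;

    return { kernel_w * kernel_h, conv_w * conv_h, channels,
             kernel_w * kernel_h, channels };
}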
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
new file mode 100644
index 0000000..216fa27
--- /dev/null
+++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLDequantizationLayerKernel::CLDequantizationLayerKernel()
+ : _input(nullptr), _output(nullptr), _min_max(nullptr)
+{
+}
+
+void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output, min_max);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::F32, 0);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _output = output;
+ _min_max = min_max;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dequantization_layer"));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+
+ // Update window and padding
+ update_window_and_padding(win, input_access, output_access, min_max_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLDequantizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
+ Window slice = window_collapsed.first_slice_window_3D();
+
+ Window min_max_window = window;
+ min_max_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+ min_max_window.set(Window::DimY, Window::Dimension(0, _min_max->info()->dimension(1), 1));
+ min_max_window.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window min_max_slice = min_max_window.first_slice_window_1D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ add_1D_tensor_argument(idx, _min_max, min_max_slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice) && min_max_window.slide_window_slice_1D(min_max_slice));
+}
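The min/max tensor is read two values wide per batch (see the AccessWindowStatic above); assuming the usual affine min/max scheme, the dequantisation the kernel is expected to perform boils down to the following sketch.

// Hedged reference of the assumed affine min/max dequantisation:
// real_value = min + quantised_value * (max - min) / 255
#include <cstdint>

float dequantize_sketch(uint8_t q, float min, float max)
{
    return min + static_cast<float>(q) * (max - min) / 255.0f;
}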
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
new file mode 100644
index 0000000..4224d9b
--- /dev/null
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
+ : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_pad_x(0), _conv_pad_y(0), _conv_stride_x(0), _conv_stride_y(0)
+{
+}
+
+BorderSize CLDirectConvolutionLayerKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) != weights->info()->dimension(1),
+ "Weights should have same width as length");
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) != 1 && weights->info()->dimension(0) != 3 && weights->info()->dimension(0) != 5,
+ "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported");
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON_MSG((weights->info()->dimension(0) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution.");
+    ARM_COMPUTE_ERROR_ON_MSG((weights->info()->dimension(0) == 3 || weights->info()->dimension(0) == 5) && std::get<0>(conv_info.stride()) > 2, "Strides larger than 2 not supported for 3x3 and 5x5 convolutions.");
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ const unsigned int kernel_size = weights->info()->dimension(0);
+
+ // Get convolved dimensions
+ unsigned int output_width = 0;
+ unsigned int output_height = 0;
+ std::tie(output_width, output_height) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info);
+
+ TensorShape output_shape = input->info()->tensor_shape();
+ output_shape.set(0, output_width);
+ output_shape.set(1, output_height);
+ output_shape.set(2, weights->info()->dimension(3));
+
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ _conv_stride_x = std::get<0>(conv_info.stride());
+ _conv_stride_y = std::get<1>(conv_info.stride());
+ _conv_pad_x = std::min(std::get<0>(conv_info.pad()), kernel_size / 2);
+ _conv_pad_y = std::min(std::get<1>(conv_info.pad()), kernel_size / 2);
+
+ _input = input;
+ _weights = weights;
+ _output = output;
+ _biases = biases;
+ _border_size = BorderSize(_conv_pad_y, _conv_pad_x);
+
+ std::set<std::string> options;
+
+ const GPUTarget gpu_target = get_arch_from_target(get_target());
+
+ if(_biases != nullptr)
+ {
+ options.emplace("-DHAS_BIAS");
+ }
+
+ if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (input->info()->data_type() == DataType::F32))
+ {
+ options.emplace("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2)));
+
+ std::string kernel_name = "direct_convolution" + support::cpp11::to_string(kernel_size) + "x" + support::cpp11::to_string(kernel_size) + "_f32_bifrost";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info());
+
+ unsigned int num_elems_read_per_iteration_x = 0;
+ unsigned int num_elems_read_per_iteration_y = 0;
+ unsigned int num_elems_written_per_iteration_x = 0;
+ unsigned int num_elems_written_per_iteration_y = 0;
+
+ switch(kernel_size)
+ {
+ case 1:
+ {
+ num_elems_read_per_iteration_x = 4;
+ num_elems_read_per_iteration_y = 4;
+ num_elems_written_per_iteration_x = 4;
+ num_elems_written_per_iteration_y = 4;
+ break;
+ }
+ case 3:
+ {
+ num_elems_read_per_iteration_x = 6;
+ num_elems_read_per_iteration_y = 5;
+ num_elems_written_per_iteration_x = 4;
+ num_elems_written_per_iteration_y = 3;
+ break;
+ }
+ case 5:
+ {
+ num_elems_read_per_iteration_x = 8;
+ num_elems_read_per_iteration_y = 6;
+ num_elems_written_per_iteration_x = 4;
+ num_elems_written_per_iteration_y = 2;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
+ }
+ }
+
+ // Calculate right and bottom border
+ const int input_width = input->info()->dimension(0) - kernel_size / 2 + _conv_pad_x;
+ const int input_height = input->info()->dimension(1) - kernel_size / 2 + _conv_pad_y;
+
+ // Create window and update padding
+ win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
+
+ AccessWindowStatic input_access(input->info(), -_conv_pad_x, -_conv_pad_y, input_width + num_elems_read_per_iteration_x, input_height + num_elems_read_per_iteration_y);
+ AccessWindowStatic weights_access(weights->info(), 0, 0, kernel_size, kernel_size);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
+
+ update_window_and_padding(win, input_access, weights_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+ }
+ else
+ {
+ std::stringstream kernel_name;
+ kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
+ DataType promoted_type = input->info()->data_type();
+
+ options.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ options.emplace("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
+ options.emplace("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2)));
+ options.emplace("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
+
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ options.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ promoted_type = DataType::QS16;
+ break;
+ case DataType::QS16:
+ promoted_type = DataType::QS32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Datatype not supported");
+ }
+ }
+
+ options.emplace("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(promoted_type));
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), options));
+
+ // Configure kernel window
+
+ bool is_stride2 = ((kernel_size != 1) && (_conv_stride_x == 2));
+
+ const unsigned int num_elems_read_per_iteration_x = 8 + 2 * (kernel_size / 2) + (is_stride2 ? 6 + kernel_size / 2 : 0);
+ const unsigned int num_elems_read_per_iteration_y = kernel_size;
+ const unsigned int num_elems_written_per_iteration_x = 8;
+ const unsigned int num_elems_written_per_iteration_y = 1;
+
+ // Calculate right and bottom border
+ const int input_width = input->info()->dimension(0) - kernel_size / 2 + _conv_pad_x;
+ const int input_height = input->info()->dimension(1) - kernel_size / 2 + _conv_pad_y;
+
+ // Create window and update padding
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
+
+ AccessWindowStatic input_access(input->info(), -_conv_pad_x, -_conv_pad_y, input_width + num_elems_read_per_iteration_x, input_height + num_elems_read_per_iteration_y);
+ AccessWindowStatic weights_access(weights->info(), 0, 0, kernel_size, kernel_size);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
+
+ update_window_and_padding(win, input_access, weights_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+ }
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "direct_convolution_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(kernel_size);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_pad_x);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_pad_y);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_stride_x);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_stride_y);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Get initial windows
+ Window slice = window.first_slice_window_3D();
+ Window win_in = window;
+
+ win_in.adjust(Window::DimX, -_conv_pad_x, true);
+ win_in.adjust(Window::DimY, -_conv_pad_y, true);
+ win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
+ win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
+
+ Window slice_in = win_in.first_slice_window_3D();
+
+ unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
+ add_3D_tensor_argument(idx1, _weights, slice);
+
+ if(_biases != nullptr)
+ {
+ Window slice_biases;
+ slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
+ add_1D_tensor_argument(idx1, _biases, slice_biases);
+ }
+
+ _kernel.setArg(idx1++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
+}
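The output width/height passed to auto_init_if_empty() come from scaled_dimensions(); assuming the default floor rounding of PadStrideInfo, the arithmetic reduces to the sketch below.

// Hedged sketch of the output-size arithmetic behind scaled_dimensions() as used above,
// assuming the default floor rounding of PadStrideInfo.
unsigned int convolved_dim(unsigned int in, unsigned int kernel, unsigned int stride, unsigned int pad)
{
    return (in + 2 * pad - kernel) / stride + 1;
}
// e.g. 224x224 input, 3x3 kernel, stride 2, pad 1: convolved_dim(224, 3, 2, 1) == 112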
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 981aad6..2e066c7 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
@@ -76,7 +77,7 @@
// Define select type required by replicate border > 1
const DataType dt = tensor->info()->data_type();
- std::string select_type = get_cl_type_from_data_type(dt);
+ std::string select_type = get_underlying_cl_type_from_data_type(dt);
if(is_data_type_float(dt))
{
select_type = (DataType::F32 == dt) ? "int" : "short";
@@ -84,12 +85,16 @@
// Define build options
std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+ build_opts.emplace(("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt)));
build_opts.emplace(("-DSELECT_TYPE=" + select_type));
- build_opts.emplace(("-DBORDER_SIZE_TOP=" + val_to_string(border_size.top)));
- build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + val_to_string(border_size.bottom)));
- build_opts.emplace(("-DBORDER_SIZE_LEFT=" + val_to_string(border_size.left)));
- build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + val_to_string(border_size.right)));
+ build_opts.emplace(("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top)));
+ build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom)));
+ build_opts.emplace(("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left)));
+ build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + support::cpp11::to_string(border_size.right)));
+ if(is_data_type_fixed_point(tensor->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION");
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
@@ -108,7 +113,7 @@
const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
// Set static kernel arguments
- unsigned int idx = num_arguments_per_2D_tensor(); //Skip the tensor parameters
+ unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters
ICLKernel::add_argument<cl_uint>(idx, valid_width);
ICLKernel::add_argument<cl_uint>(idx, valid_height);
ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
@@ -119,9 +124,14 @@
case DataType::U8:
set_constant_border<uint8_t>(idx, constant_border_value);
break;
+ case DataType::QS8:
+ case DataType::S8:
+ set_constant_border<int8_t>(idx, constant_border_value);
+ break;
case DataType::U16:
set_constant_border<uint16_t>(idx, constant_border_value);
break;
+ case DataType::QS16:
case DataType::S16:
set_constant_border<int16_t>(idx, constant_border_value);
break;
@@ -148,7 +158,7 @@
Window win;
win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height));
win.set(Window::DimY, Window::Dimension(0, 1, 1));
- win.use_tensor_dimensions(tensor->info(), Window::DimZ);
+ win.use_tensor_dimensions(tensor->info()->tensor_shape(), Window::DimZ);
ICLKernel::configure(win);
}
@@ -163,13 +173,13 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window slice = window.first_slice_window_3D();
do
{
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _tensor, slice);
+ add_3D_tensor_argument(idx, _tensor, slice);
enqueue(queue, *this, slice, cl::NullRange);
}
- while(window.slide_window_slice_2D(slice));
+ while(window.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/kernels/CLFloorKernel.cpp b/src/core/CL/kernels/CLFloorKernel.cpp
new file mode 100644
index 0000000..6c9f83f
--- /dev/null
+++ b/src/core/CL/kernels/CLFloorKernel.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLFloorKernel::CLFloorKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLFloorKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ set_data_type_if_unknown(*input->info(), DataType::F32);
+ set_data_type_if_unknown(*output->info(), DataType::F32);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("floor_layer", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLFloorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 71d42c5..268260b 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -43,18 +43,27 @@
void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ TensorShape output_shape = input->info()->tensor_shape();
+ output_shape.set(0, input->info()->dimension(0) * 4);
+ output_shape.set(1, std::ceil(input->info()->dimension(1) / 4.0f));
+
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(static_cast<float>(input->info()->dimension(1)) / 4.0f));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
// Create kernel
std::string data_type_name;
- data_type_name = val_to_string(input->info()->element_size() * 8) + "bit";
+ data_type_name = support::cpp11::to_string(input->info()->element_size() * 8) + "bit";
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4_" + data_type_name));
// Configure kernel window
@@ -72,6 +81,14 @@
output_access.set_valid_region(win, input->info()->valid_region());
ICLKernel::configure(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "interleave4x4_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
}
void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue)
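The auto-initialised output shape replaces the two removed dimension asserts; a quick numerical check of the [w * 4, ceil(h / 4)] shape it computes:

// Quick check of the interleaved output shape computed above.
#include <cmath>
#include <cstddef>
#include <utility>

std::pair<std::size_t, std::size_t> interleaved4x4_shape(std::size_t w, std::size_t h)
{
    return { w * 4, static_cast<std::size_t>(std::ceil(h / 4.0f)) };
}
// e.g. a 13x6 matrix becomes 52x2 after interleaving.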
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index c6e05b9..ef572cf 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -33,6 +33,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
#include <cstddef>
#include <cstdint>
@@ -63,8 +64,8 @@
_output = output;
// Create kernel and set static arguments
- std::set<std::string> build_opts = { ("-DWIDTH_MATRIX_B=" + val_to_string(input1->info()->dimension(0))) };
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_u8", build_opts));
+ std::set<std::string> build_opts = { ("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))) };
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_u8", build_opts));
unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
_kernel.setArg<int32_t>(idx++, a_offset);
_kernel.setArg<int32_t>(idx++, b_offset);
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
index 289873c..263cfab 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -43,24 +43,30 @@
void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(biases, accum);
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
_biases = biases;
_accum = accum;
+ std::set<std::string> build_opts;
+ build_opts.insert(("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type())));
+ if(is_data_type_fixed_point(accum->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(accum->info()->fixed_point_position()));
+ }
+
// Create kernel
- std::string data_type_name = lower_string(string_from_data_type(accum->info()->data_type()));
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases_" + data_type_name));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts));
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(accum->info()->data_type());
+ const unsigned int num_elems_processed_per_iteration = 16;
Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowStatic biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1));
+ AccessWindowStatic biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration), biases->info()->dimension(1));
AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, biases_access, accum_access);
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
index 343838f..1499df0 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -40,10 +41,9 @@
{
}
-void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *output, const float beta)
+void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *output, float beta)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
@@ -53,7 +53,19 @@
const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type());
std::ostringstream ma_arguments;
- ma_arguments << "-DBETA=" << beta;
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ ma_arguments << "-DBETA=" << (input->info()->data_type() == DataType::QS8 ?
+ sqcvt_qs8_f32(beta, input->info()->fixed_point_position()) :
+ sqcvt_qs16_f32(beta, input->info()->fixed_point_position()))
+ << " ";
+ ma_arguments << "-DFIXED_POINT_POSITION=" << input->info()->fixed_point_position();
+ }
+ else
+ {
+ ma_arguments << "-DBETA=" << beta;
+ }
+
std::set<std::string> build_opts;
build_opts.emplace(ma_arguments.str());
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index d7388e8..b184c50 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -25,12 +25,12 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/AccessWindowTranspose.h"
-
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
@@ -48,13 +48,13 @@
{
}
-void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha)
+void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- if(output->info()->dimension(1) == 1)
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+ if(!is_interleaved_transposed)
{
ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
}
@@ -72,51 +72,36 @@
_lws_hint = cl::NDRange(8, 8);
}
- std::ostringstream mm_arguments;
- mm_arguments << "-DWIDTH_MATRIX_B=" << input1->info()->dimension(0) << " ";
- mm_arguments << "-DALPHA=" << alpha << " ";
std::set<std::string> build_opts;
+ build_opts.emplace(("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))));
+ build_opts.emplace(("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))));
- // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication
- if(output->info()->dimension(1) == 1)
+ if(is_data_type_fixed_point(input0->info()->data_type()))
{
- mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " ";
- build_opts.emplace(mm_arguments.str());
+ build_opts.emplace(("-DALPHA=" + support::cpp11::to_string((input0->info()->data_type() == DataType::QS8 ?
+ sqcvt_qs8_f32(alpha, input0->info()->fixed_point_position()) :
+ sqcvt_qs16_f32(alpha, input0->info()->fixed_point_position())))));
- // Create kernel
- std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_vm_" + data_type_name), build_opts));
-
- // Configure window kernel
- const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
-
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
-
- AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
- AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
-
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
+ build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input0->info()->fixed_point_position())));
}
else
{
- build_opts.emplace(mm_arguments.str());
+ build_opts.emplace(("-DALPHA=" + float_to_string_with_full_precision(alpha)));
+ }
+ if(is_interleaved_transposed)
+ {
// Create kernel
std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
if(data_type_name == "f32")
{
GPUTarget arch_target = get_arch_from_target(get_target());
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_f32_" + string_from_target(arch_target), build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target), build_opts));
}
else
{
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_" + data_type_name, build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_" + data_type_name, build_opts));
}
// Configure window kernel
@@ -135,6 +120,55 @@
ICLKernel::configure(win);
}
+ else // The input tensors have not been reshaped
+ {
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+
+ // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+ const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())));
+ build_opts.emplace(("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x)));
+ build_opts.emplace(("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y)));
+
+ // Create kernel
+ if(is_data_type_fixed_point(input0->info()->data_type()))
+ {
+ std::string kernel_name = "gemm_mm_" + lower_string(string_from_data_type(input0->info()->data_type()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel((kernel_name), build_opts));
+ }
+ else
+ {
+ std::string kernel_name = "gemm_mm_floating_point";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel((kernel_name), build_opts));
+ }
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
+ AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "gemm_";
+ _config_id += (is_interleaved_transposed ? "reshaped_" : "");
+ _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
+ }
}
void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -144,9 +178,9 @@
Window slice = window.first_slice_window_2D();
Window slice_matrix_b = slice;
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1));
- slice_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
do
{
diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
new file mode 100644
index 0000000..70af5d6
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+using namespace arm_compute;
+
+CLGEMMMatrixVectorMultiplyKernel::CLGEMMMatrixVectorMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _num_rows_read_per_iteration(0), _border_size(0)
+{
+}
+BorderSize CLGEMMMatrixVectorMultiplyKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLGEMMMatrixVectorMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1));
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
+ build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input0->info()->dimension(0)));
+ build_opts.emplace("-DSRC_HEIGHT=" + support::cpp11::to_string(input0->info()->dimension(1)));
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mv", build_opts));
+
+ // Configure kernel window
+ const unsigned int num_elems_read_per_iteration = 4;
+
+ _num_rows_read_per_iteration = 4;
+
+ const unsigned int border_x = ceil_to_multiple(input0->info()->dimension(0), num_elems_read_per_iteration) - input0->info()->dimension(0);
+ const unsigned int border_y = ceil_to_multiple(input0->info()->dimension(1), _num_rows_read_per_iteration) - input0->info()->dimension(1);
+
+ _border_size = BorderSize(border_y, border_x);
+
+ Window win = calculate_max_window(*input0->info(), Steps(num_elems_read_per_iteration));
+
+ AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration, _num_rows_read_per_iteration);
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_read_per_iteration);
+ AccessWindowStatic output_access(_output->info(), 0, 0, _output->info()->dimension(0) + border_x, _output->info()->dimension(1) + border_y);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMMatrixVectorMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_3D();
+ Window slice_in2 = window.first_slice_window_3D();
+ Window slice_out = window.first_slice_window_3D();
+
+ // Setup input0 slice
+ slice_in.set(Window::DimX, Window::Dimension(0, _input0->info()->dimension(0), _input0->info()->dimension(0)));
+ slice_in.set(Window::DimY, Window::Dimension(0, _input0->info()->dimension(1) + border_size().bottom, _num_rows_read_per_iteration));
+ slice_in.set(Window::DimZ, Window::Dimension(0, _input0->info()->dimension(2), 1));
+
+ // Setup input1 and output slice. Their dimensions are increased in the cl kernel.
+ slice_in2.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in2.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in2.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ unsigned int idx_1 = num_arguments_per_3D_tensor();
+
+ add_2D_tensor_argument(idx_1, _input1, slice_in2);
+
+ do
+ {
+ unsigned int idx_0 = 0;
+ unsigned int idx_2 = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+ add_3D_tensor_argument(idx_0, _input0, slice_in);
+ add_1D_tensor_argument(idx_2, _output, slice_out);
+ enqueue(queue, *this, slice_in);
+ }
+ while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+}
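
Note (editorial): the right/bottom border computed in configure() above is just the distance from each tensor extent to the next multiple of the read width. A minimal standalone C++ sketch of that arithmetic (illustrative values; ceil_to_multiple restated here, not part of the patch):

    #include <cstdio>

    // Restatement of ceil_to_multiple(): round value up to the next multiple of step
    static unsigned int ceil_to_multiple(unsigned int value, unsigned int step)
    {
        return ((value + step - 1) / step) * step;
    }

    int main()
    {
        const unsigned int width = 10, height = 7;                 // illustrative input0 dimensions
        const unsigned int elems_per_read = 4, rows_per_read = 4;  // as in configure() above

        // Right/bottom border so that 4-wide / 4-row reads never fall outside the tensor
        const unsigned int border_x = ceil_to_multiple(width, elems_per_read) - width;   // 2
        const unsigned int border_y = ceil_to_multiple(height, rows_per_read) - height;  // 1

        std::printf("border right=%u bottom=%u\n", border_x, border_y);
        return 0;
    }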
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index ecee1ab..5057c8f 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -40,8 +40,9 @@
void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
TensorShape output_shape{ input->info()->tensor_shape() };
const size_t transpose_w = 16 / input->info()->element_size();
@@ -53,10 +54,13 @@
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
- _input = input;
- _output = output;
- const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type());
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const int scale_x = num_elems_processed_per_iteration;
+
+ _input = input;
+ _output = output;
/*
* Following an example of how the transposition1xW works when the input data type is F32
@@ -66,41 +70,23 @@
* |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
* |a30 a31 a32 a33|
*
- * If the input data type is F32, the output matrix will have the following shape: [ height * 4, width / 4 ]
- * If the input data type is F16, the output matrix will have the following shape: [ height * 8, width / 8 ]
+ * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
*/
// Create kernel
- std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type()));
- std::string kernel_name = "gemm_transpose1x" + val_to_string(num_elems_processed_per_iteration) + "_" + data_type_name;
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+ std::string kernel_name = "gemm_transpose1x" + support::cpp11::to_string(num_elems_processed_per_iteration);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
// Configure window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- float scale_x = 1.f;
-
- switch(input->info()->data_type())
- {
- case DataType::U8:
- scale_x = 16.f;
- break;
- case DataType::F16:
- scale_x = 8.f;
- break;
- case DataType::F32:
- scale_x = 4.f;
- break;
- default:
- // Do nothing
- break;
- }
+ ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
ICLKernel::configure(win);
}
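
For reference, the updated comment above states the transposed shape as [ height * W, ceil(width / W) ] with W = 16 / element size. A small standalone C++ sketch of that shape computation (illustrative dimensions only):

    #include <cstdio>

    int main()
    {
        // Illustrative F32 input (element size 4 bytes), 70 x 23 matrix
        const unsigned int element_size = 4;
        const unsigned int width = 70, height = 23;

        const unsigned int W          = 16 / element_size;    // elements packed per block, here 4
        const unsigned int out_width  = height * W;           // height * W       -> 92
        const unsigned int out_height = (width + W - 1) / W;  // ceil(width / W)  -> 18

        std::printf("gemm_transpose1x%u output: %u x %u\n", W, out_width, out_height);
        return 0;
    }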
diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
index 9fc34a7..1f757fe 100644
--- a/src/core/CL/kernels/CLHarrisCornersKernel.cpp
+++ b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
+#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
@@ -91,8 +92,8 @@
// Configure kernel window
constexpr unsigned int num_elems_processed_per_iteration = 4;
constexpr unsigned int num_elems_written_per_iteration = 4;
- constexpr unsigned int num_elems_read_per_iteration = 8;
- constexpr unsigned int num_rows_read_per_iteration = 3;
+ const unsigned int num_elems_read_per_iteration = block_size == 7 ? 10 : 8;
+ const unsigned int num_rows_read_per_iteration = block_size;
Window win = calculate_max_window(*_input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 8c0fe26..98a799f 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -29,8 +29,10 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
#include <cmath>
#include <tuple>
@@ -38,14 +40,15 @@
using namespace arm_compute;
CLIm2ColKernel::CLIm2ColKernel()
- : _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_size(0), _num_elems_processed_per_iteration(1), _run_func(nullptr)
+ : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr)
{
}
-void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias)
+void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
@@ -55,6 +58,11 @@
build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
build_opts.emplace((has_bias ? "-DHAS_BIAS" : ""));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
+
int pad_x = 0;
int pad_y = 0;
int stride_x = 0;
@@ -70,45 +78,31 @@
if(!run_img2col_reduced)
{
- _convolved_dims = convolved_dims;
- _conv_info = conv_info;
- _kernel_size = std::sqrt((output->info()->dimension(0) - (has_bias ? 1 : 0)) / input->info()->dimension(2));
+ _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
+ kernel_dims.width, kernel_dims.height,
+ conv_info);
_num_elems_processed_per_iteration = output->info()->dimension(0);
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_generic", build_opts));
+ build_opts.emplace("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
+ build_opts.emplace("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
+ build_opts.emplace("-DKERNEL_DEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.emplace("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(_convolved_dims.first));
+ build_opts.emplace("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(_convolved_dims.second));
+ build_opts.emplace("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
+ build_opts.emplace("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
+ build_opts.emplace("-DPAD_X=" + support::cpp11::to_string(conv_info.pad().first));
+ build_opts.emplace("-DPAD_Y=" + support::cpp11::to_string(conv_info.pad().second));
+ build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.emplace("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
- // Create static kernel arguments
- const cl_int2 input_dims =
+ if(kernel_dims.width == 3 && kernel_dims.height == 3 && conv_info.pad().first == 0 && conv_info.pad().second == 0)
{
- {
- static_cast<cl_int>(input->info()->dimension(0)),
- static_cast<cl_int>(input->info()->dimension(1)),
- }
- };
- const cl_int2 strides =
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_kernel3x3_padx0_pady0", build_opts));
+ }
+ else
{
- {
- stride_x,
- stride_y,
- }
- };
- const cl_int2 paddings =
- {
- {
- pad_x,
- pad_y,
- }
- };
-
- // Set static kernel arguments
- unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
- _kernel.setArg<cl_int>(idx++, _kernel_size);
- _kernel.setArg<cl_int>(idx++, input->info()->dimension(2) /* depth */);
- _kernel.setArg<cl_int>(idx++, _convolved_dims.first /* output width */);
- _kernel.setArg<cl_int2>(idx++, input_dims);
- _kernel.setArg<cl_int2>(idx++, strides);
- _kernel.setArg<cl_int2>(idx++, paddings);
-
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_generic", build_opts));
+ }
_run_func = &CLIm2ColKernel::run_generic;
}
else
@@ -122,7 +116,22 @@
Window win = calculate_max_window(*input->info(), Steps());
// The CLIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ if(!run_img2col_reduced)
+ {
+ // Set the Z dimension's step to the size of the whole dimension so that the window cannot be split across Z
+ win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
+ }
+
ICLKernel::configure(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "im2col_";
+ _config_id += (run_img2col_reduced ? "reduced_" : "");
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
}
void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -136,22 +145,18 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
- int pad_x = 0;
- int pad_y = 0;
- int stride_x = 0;
- int stride_y = 0;
- std::tie(pad_x, pad_y) = _conv_info.pad();
- std::tie(stride_x, stride_y) = _conv_info.stride();
-
// Get initial windows
- Window slice = window.first_slice_window_3D();
- Window slice_in = window.first_slice_window_3D();
- Window slice_out = window.first_slice_window_3D();
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ // Change the Z dimension's step back to 1
+ window_collapsed.set_dimension_step(Window::DimZ, 1);
+
+ Window slice = window_collapsed.first_slice_window_3D();
+ Window slice_in = window_collapsed.first_slice_window_3D();
+ Window slice_out = window_collapsed.first_slice_window_3D();
// Setup slice
slice.set(Window::DimX, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1));
slice.set(Window::DimY, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
- slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
// Setup input slice
// The first three dimensions of the input are increased by the inner loops
@@ -166,13 +171,15 @@
do
{
- // Set inputs
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice_in);
add_2D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->dimension(2)));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
+ enqueue(queue, *this, slice, _lws_hint);
}
- while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out) && window.slide_window_slice_3D(slice_in));
+ while(window_collapsed.slide_window_slice_3D(slice) && window_collapsed.slide_window_slice_3D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
}
void CLIm2ColKernel::run_reduced(const Window &window, cl::CommandQueue &queue)
@@ -181,7 +188,7 @@
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
Window out_window;
- out_window.use_tensor_dimensions(_output->info());
+ out_window.use_tensor_dimensions(_output->info()->tensor_shape());
Window out_slice = out_window.first_slice_window_1D();
Window in_slice = window.first_slice_window_3D();
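
As a quick illustration of the change above: _convolved_dims now comes from scaled_dimensions() with the kernel size and PadStrideInfo. A standalone sketch of the underlying arithmetic, assuming the usual floor-rounded formula (the helper name below is hypothetical, not a library call):

    #include <cstdio>
    #include <utility>

    // Assumed convolved output size with floor rounding:
    //   out = (in + 2 * pad - kernel) / stride + 1
    static std::pair<unsigned int, unsigned int> convolved_dims(unsigned int in_w, unsigned int in_h,
                                                                unsigned int k_w, unsigned int k_h,
                                                                unsigned int stride_x, unsigned int stride_y,
                                                                unsigned int pad_x, unsigned int pad_y)
    {
        const unsigned int out_w = (in_w + 2 * pad_x - k_w) / stride_x + 1;
        const unsigned int out_h = (in_h + 2 * pad_y - k_h) / stride_y + 1;
        return { out_w, out_h };
    }

    int main()
    {
        // 224x224 input, 3x3 kernel, stride 1, no padding -> 222x222 patch positions per channel
        const auto dims = convolved_dims(224, 224, 3, 3, 1, 1, 0, 0);
        std::printf("convolved: %u x %u\n", dims.first, dims.second);
        return 0;
    }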
diff --git a/src/core/CL/kernels/CLL2NormalizeKernel.cpp b/src/core/CL/kernels/CLL2NormalizeKernel.cpp
new file mode 100644
index 0000000..3e0758c
--- /dev/null
+++ b/src/core/CL/kernels/CLL2NormalizeKernel.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLL2NormalizeKernel::CLL2NormalizeKernel()
+ : _input(nullptr), _sum(nullptr), _output(nullptr), _axis(0), _epsilon(1e-12)
+{
+}
+
+void CLL2NormalizeKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, unsigned int axis, float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ // Sum and output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis. Supported axis is 0");
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _sum = sum;
+ _output = output;
+ _axis = axis;
+ _epsilon = epsilon;
+
+ const unsigned int num_elems_processed_per_iteration = 16;
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("l2_normalize", build_opts));
+
+ // Set epsilon argument
+ unsigned int idx = num_arguments_per_1D_tensor() * 3;
+ _kernel.setArg<cl_float>(idx, _epsilon);
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLL2NormalizeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window in_slice = window.first_slice_window_1D();
+ Window sum_slice = window_sum.first_slice_window_1D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, in_slice);
+ add_1D_tensor_argument(idx, _sum, sum_slice);
+ add_1D_tensor_argument(idx, _output, in_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
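
For reference, a host-side sketch of what the l2_normalize kernel computes per row, assuming the usual definition out = in / sqrt(max(sum_of_squares, epsilon)); illustrative values, not the OpenCL source:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main()
    {
        // One row along axis 0; sum_sq plays the role of the _sum tensor
        const std::vector<float> row = { 3.f, 4.f };
        const float epsilon = 1e-12f;

        float sum_sq = 0.f;
        for(float v : row)
        {
            sum_sq += v * v;
        }

        // Assumed definition: out = in / sqrt(max(sum_sq, epsilon))
        const float inv_norm = 1.f / std::sqrt(std::max(sum_sq, epsilon));
        for(float v : row)
        {
            std::printf("%f ", v * inv_norm); // 0.600000 0.800000
        }
        std::printf("\n");
        return 0;
    }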
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
index 794a1bc..508fb89 100644
--- a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -101,7 +101,7 @@
Window slice = window.first_slice_window_2D();
Window matrix_b_window;
- matrix_b_window.use_tensor_dimensions(_input1->info());
+ matrix_b_window.use_tensor_dimensions(_input1->info()->tensor_shape());
Window slice_matrix_b = matrix_b_window.first_slice_window_3D();
do
diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
index b0b748f..1bf831b 100644
--- a/src/core/CL/kernels/CLMeanStdDevKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
@@ -40,10 +40,15 @@
using namespace arm_compute;
CLMeanStdDevKernel::CLMeanStdDevKernel()
- : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr)
+ : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _border_size(0)
{
}
+BorderSize CLMeanStdDevKernel::border_size() const
+{
+ return _border_size;
+}
+
void CLMeanStdDevKernel::configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
{
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
@@ -83,6 +88,8 @@
constexpr unsigned int num_elems_processed_per_iteration_x = 8;
const unsigned int num_elems_processed_per_iteration_y = input->info()->dimension(1);
+ _border_size = BorderSize(ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) - input->info()->dimension(0));
+
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
update_window_and_padding(win, input_access);
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
new file mode 100644
index 0000000..9b4533b
--- /dev/null
+++ b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <climits>
+
+using namespace arm_compute;
+
+CLMinMaxLayerKernel::CLMinMaxLayerKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(Window::DimX, 2);
+ output_shape.remove_dimension(1);
+ output_shape.remove_dimension(1);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+
+ const unsigned int num_elems_processed_per_iteration = 1;
+
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.emplace("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmax_layer", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output->info(), 0, 0, 2, output->info()->dimension(1));
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLMinMaxLayerKernel::reset(cl::CommandQueue &queue)
+{
+ _output->map(queue, true);
+
+ Window window_output;
+ window_output.use_tensor_dimensions(_output->info()->tensor_shape());
+ window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_output.collapse_if_possible(ICLKernel::window(), 1);
+
+ Iterator output(_output, window_output);
+
+ // Reset output
+ execute_window_loop(window_output, [&](const Coordinates & id)
+ {
+ auto *ptr = reinterpret_cast<float *>(output.ptr());
+ ptr[0] = std::numeric_limits<float>::max();
+ ptr[1] = std::numeric_limits<float>::lowest();
+ },
+ output);
+
+ _output->unmap(queue);
+}
+
+void CLMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Collapse min/max batches
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
+ Window slice = window_collapsed.first_slice_window_3D();
+ slice.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+ slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ Window window_output;
+ window_output.use_tensor_dimensions(_output->info()->tensor_shape());
+ window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_output.collapse_if_possible(ICLKernel::window(), 1);
+
+ Window output_slice = window_output.first_slice_window_1D();
+
+ do
+ {
+ unsigned int idx = 0;
+ // Set inputs
+ add_3D_tensor_argument(idx, _input, slice);
+ add_1D_tensor_argument(idx, _output, output_slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice) && window_output.slide_window_slice_1D(output_slice));
+}
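
Note: per batch, the kernel above reduces all W x H x D values to a { min, max } pair. A host-side reference of that reduction (illustrative data only):

    #include <algorithm>
    #include <array>
    #include <cstdio>
    #include <vector>

    int main()
    {
        // All values of one batch, flattened; the kernel writes one { min, max } pair per batch
        const std::vector<float> batch = { -1.5f, 0.25f, 7.f, -3.f };

        std::array<float, 2> min_max = { { batch[0], batch[0] } };
        for(float v : batch)
        {
            min_max[0] = std::min(min_max[0], v);
            min_max[1] = std::max(min_max[1], v);
        }
        std::printf("min=%f max=%f\n", min_max[0], min_max[1]); // -3, 7
        return 0;
    }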
diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
index 939a53b..5636592 100644
--- a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
@@ -32,7 +32,27 @@
#include <climits>
-using namespace arm_compute;
+namespace arm_compute
+{
+inline int32_t FloatFlip(float val)
+{
+ static_assert(sizeof(float) == sizeof(int32_t), "Float must be the same size as int32_t");
+ int32_t int_val = 0;
+
+ memcpy(&int_val, &val, sizeof(float));
+ int_val = (int_val >= 0) ? int_val : int_val ^ 0x7FFFFFFF;
+ return int_val;
+}
+
+inline float IFloatFlip(int32_t val)
+{
+ static_assert(sizeof(float) == sizeof(int32_t), "Float must be the same size as int32_t");
+ float flt_val = 0.f;
+
+ val = (val >= 0) ? val : val ^ 0x7FFFFFFF;
+ memcpy(&flt_val, &val, sizeof(float));
+ return flt_val;
+}
CLMinMaxKernel::CLMinMaxKernel()
: _input(nullptr), _min_max(), _data_type_max_min()
@@ -41,7 +61,7 @@
void CLMinMaxKernel::configure(const ICLImage *input, cl::Buffer *min_max)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
ARM_COMPUTE_ERROR_ON(min_max == nullptr);
@@ -59,16 +79,33 @@
_data_type_max_min[0] = SHRT_MAX;
_data_type_max_min[1] = SHRT_MIN;
break;
+ case DataType::F32:
+ _data_type_max_min[0] = FloatFlip(std::numeric_limits<float>::max());
+ _data_type_max_min[1] = FloatFlip(std::numeric_limits<float>::lowest());
+ break;
default:
ARM_COMPUTE_ERROR("You called with the wrong image data types");
}
// Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_MAX=" + val_to_string<int>(_data_type_max_min[0]));
- build_opts.emplace("-DDATA_TYPE_MIN=" + val_to_string<int>(_data_type_max_min[1]));
- build_opts.emplace((0 != (num_elems_processed_per_iteration % max_cl_vector_width)) ? "-DNON_MULTIPLE_OF_16" : "");
+ std::set<std::string> build_opts{ "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()) };
+
+ if(num_elems_processed_per_iteration % max_cl_vector_width != 0)
+ {
+ build_opts.emplace("-DNON_MULTIPLE_OF_16");
+ }
+
+ if(input->info()->data_type() == DataType::F32)
+ {
+ build_opts.emplace("-DDATA_TYPE_MAX=" + support::cpp11::to_string(std::numeric_limits<float>::max()));
+ build_opts.emplace("-DDATA_TYPE_MIN=" + support::cpp11::to_string(std::numeric_limits<float>::lowest()));
+ build_opts.emplace("-DIS_DATA_TYPE_FLOAT");
+ }
+ else
+ {
+ build_opts.emplace("-DDATA_TYPE_MAX=" + support::cpp11::to_string(_data_type_max_min[0]));
+ build_opts.emplace("-DDATA_TYPE_MIN=" + support::cpp11::to_string(_data_type_max_min[1]));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmax", build_opts));
@@ -76,11 +113,11 @@
// Set fixed arguments
unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
_kernel.setArg(idx++, *_min_max);
- _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
+ _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(input->info()->dimension(0)));
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, ceil_to_multiple(num_elems_processed_per_iteration, 16)));
ICLKernel::configure(win);
}
@@ -100,6 +137,28 @@
enqueue(queue, *this, slice);
}
while(window.slide_window_slice_2D(slice));
+
+ cl_int min = 0;
+ cl_int max = 0;
+ queue.enqueueReadBuffer(*_min_max, CL_TRUE /* blocking */, 0 * sizeof(cl_int), sizeof(cl_int), static_cast<int *>(&min));
+ queue.enqueueReadBuffer(*_min_max, CL_TRUE /* blocking */, 1 * sizeof(cl_int), sizeof(cl_int), static_cast<int *>(&max));
+
+ if(_input->info()->data_type() == DataType::F32)
+ {
+ std::array<float, 2> min_max =
+ {
+ {
+ IFloatFlip(min),
+ IFloatFlip(max)
+ }
+ };
+ queue.enqueueWriteBuffer(*_min_max, CL_TRUE /* blocking */, 0, min_max.size() * sizeof(float), min_max.data());
+ }
+ else
+ {
+ std::array<int32_t, 2> min_max = { { min, max } };
+ queue.enqueueWriteBuffer(*_min_max, CL_TRUE /* blocking */, 0, min_max.size() * sizeof(int32_t), min_max.data());
+ }
}
CLMinMaxLocationKernel::CLMinMaxLocationKernel()
@@ -109,7 +168,7 @@
void CLMinMaxLocationKernel::configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc, ICLCoordinates2DArray *max_loc)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
ARM_COMPUTE_ERROR_ON(min_max == nullptr);
ARM_COMPUTE_ERROR_ON(min_max_count == nullptr && min_loc == nullptr && max_loc == nullptr);
@@ -123,6 +182,10 @@
build_opts.emplace((min_max_count != nullptr) ? "-DCOUNT_MIN_MAX" : "");
build_opts.emplace((min_loc != nullptr) ? "-DLOCATE_MIN" : "");
build_opts.emplace((max_loc != nullptr) ? "-DLOCATE_MAX" : "");
+ if(input->info()->data_type() == DataType::F32)
+ {
+ build_opts.emplace("-DIS_DATA_TYPE_FLOAT");
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmaxloc", build_opts));
@@ -167,3 +230,4 @@
}
while(window.slide_window_slice_2D(slice));
}
+} // namespace arm_compute
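
For reference, FloatFlip()/IFloatFlip() above map float ordering onto int32 ordering so the kernel can keep using integer min/max for F32 inputs. A standalone check of that property (helper restated here, illustrative values):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Restatement of FloatFlip(): non-negative floats already compare correctly as int32;
    // negative floats are XOR-ed (excluding the sign bit) so larger magnitudes map to smaller ints.
    static int32_t float_flip(float val)
    {
        int32_t i = 0;
        std::memcpy(&i, &val, sizeof(float));
        return (i >= 0) ? i : (i ^ 0x7FFFFFFF);
    }

    int main()
    {
        const float a = -2.5f, b = -1.0f, c = 0.0f, d = 3.25f;
        // Ordering is preserved, so integer comparisons give the float result: prints 1 1 1 1
        std::printf("%d %d %d %d\n",
                    float_flip(a) < float_flip(b), float_flip(b) < float_flip(c),
                    float_flip(c) < float_flip(d), float_flip(a) < float_flip(d));
        return 0;
    }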
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index 106a511..a744739 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
@@ -35,7 +36,7 @@
using namespace arm_compute;
CLNormalizationLayerKernel::CLNormalizationLayerKernel()
- : _input(nullptr), _squared_input(nullptr), _output(nullptr), _border_size(0)
+ : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false)
{
}
@@ -44,48 +45,61 @@
return _border_size;
}
-void CLNormalizationLayerKernel::configure(const ICLTensor *input, const ICLTensor *squared_input, ICLTensor *output, NormalizationLayerInfo norm_info)
+void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented");
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
+ }
+
+ _input = input;
+ _output = output;
+
+ _is_in_map = (norm_info.type() != NormType::CROSS_MAP);
+ const unsigned int border_width = _is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
+ _border_size = BorderSize(0, border_width);
+
+ const unsigned int num_elems_processed_per_iteration = (is_data_type_fixed_point(input->info()->data_type())) ? 16 : 4;
+ const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
// Set build options
std::set<std::string> build_opts;
build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-
- _input = input;
- _squared_input = squared_input;
- _output = output;
-
- const bool is_in_map = (norm_info.type() == NormType::IN_MAP_1D);
- const unsigned int border_width = is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
- _border_size = BorderSize(0, border_width);
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
+ }
+ build_opts.emplace(("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
+ build_opts.emplace(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
+ build_opts.emplace(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
+ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ build_opts.emplace(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
+ build_opts.emplace(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
// Create kernel
std::string kernel_name = (norm_info.type() == NormType::IN_MAP_1D) ? "normalization_layer_in_map_1D" : "normalization_layer_cross_map";
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
- // Set kernel static arguments
- unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
- _kernel.setArg<cl_float>(idx++, norm_info.scale_coeff());
- _kernel.setArg<cl_float>(idx++, norm_info.beta());
- _kernel.setArg<cl_float>(idx++, norm_info.kappa());
- _kernel.setArg<cl_uint>(idx++, norm_info.norm_size() / 2);
-
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration = (is_in_map) ? 4 : 1;
- const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
-
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input->info(), -_border_size.left, num_elems_read_per_iteration);
- AccessWindowHorizontal squared_input_access(squared_input->info(), -_border_size.left, num_elems_read_per_iteration);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, input_access, squared_input_access, output_access);
+ update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, input->info()->valid_region());
@@ -97,15 +111,16 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- Window slice = window.first_slice_window_3D();
+ const int collapsed_dimension = _is_in_map ? Window::DimZ : 4;
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), collapsed_dimension);
+ Window slice = window_collapsed.first_slice_window_3D();
do
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _squared_input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_3D(slice));
+ while(window_collapsed.slide_window_slice_3D(slice));
}
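
As an illustration of what the new build options above parametrise (COEFF, BETA, KAPPA, RADIUS), a host-side sketch assuming the usual LRN definition out[c] = in[c] / (kappa + coeff * sum of squares over the window)^beta; values are illustrative:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main()
    {
        // Cross-map window around each channel c of one spatial position
        const std::vector<float> in = { 1.f, 2.f, 3.f, 4.f };
        const float kappa = 1.f, alpha = 1e-4f, beta = 0.75f;
        const int   norm_size = 3, radius = norm_size / 2;
        const float coeff = alpha / norm_size; // corresponds to norm_info.scale_coeff()
        const int   last  = static_cast<int>(in.size()) - 1;

        for(int c = 0; c <= last; ++c)
        {
            float sum_sq = 0.f;
            for(int j = std::max(0, c - radius); j <= std::min(last, c + radius); ++j)
            {
                sum_sq += in[j] * in[j];
            }
            std::printf("%f ", in[c] / std::pow(kappa + coeff * sum_sq, beta));
        }
        std::printf("\n");
        return 0;
    }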
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index 84eb434..33c8b81 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -48,12 +48,36 @@
void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative. ");
+ if(is_data_type_fixed_point(input1->info()->data_type()))
+ {
+ // All data types must be all QS8 or all QS16
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_MSG(scale != 1, "Unsupported scaling factor for QS8/QS16. Scale must be 1.");
+ }
_input1 = input1;
_input2 = input2;
@@ -79,13 +103,28 @@
if(is_data_type_float(input1->info()->data_type()) || is_data_type_float(input2->info()->data_type()))
{
scale_int = -1;
- compute_type = (DataType::F32 == input1->info()->data_type() || DataType::F32 == input2->info()->data_type()) ? "float" : "half";
+ compute_type = (input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) ? "float" : "half";
data_type = "DATA_TYPE_FLOAT";
}
else
{
- compute_type = (DataType::S16 == input1->info()->data_type() || DataType::S16 == input2->info()->data_type()) ? "int" : "ushort";
- data_type = "DATA_TYPE_INT";
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ compute_type = "int";
+ }
+ else if(input1->info()->data_type() == DataType::QS8)
+ {
+ compute_type = "qs8";
+ }
+ else if(input1->info()->data_type() == DataType::QS16)
+ {
+ compute_type = "qs16";
+ }
+ else
+ {
+ compute_type = "ushort";
+ }
+ data_type = "DATA_TYPE_INT";
}
// Construct kernel name
@@ -96,6 +135,10 @@
std::set<std::string> build_opts;
build_opts.emplace((overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) ? "-DWRAP" : "-DSATURATE");
build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz" : "-DROUND=_rte");
+ if(is_data_type_fixed_point(input1->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input1->info()->fixed_point_position()));
+ }
build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
@@ -106,7 +149,7 @@
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
// Set scale argument
- unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the inputs and output parameters
+ unsigned int idx = 3 * num_arguments_per_3D_tensor(); //Skip the inputs and output parameters
if(scale_int >= 0)
{
@@ -140,15 +183,15 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window slice = window.first_slice_window_3D();
do
{
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input1, slice);
- add_2D_tensor_argument(idx, _input2, slice);
- add_2D_tensor_argument(idx, _output, slice);
+ add_3D_tensor_argument(idx, _input1, slice);
+ add_3D_tensor_argument(idx, _input2, slice);
+ add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice));
+ while(window.slide_window_slice_3D(slice));
}
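
For reference, a host-side sketch of the element-wise semantics configured above for the U8 case: product times scale, truncated toward zero (RoundingPolicy::TO_ZERO) and clamped to the output range (ConvertPolicy::SATURATE). Illustrative values only, not the OpenCL source:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const std::vector<uint8_t> in1 = { 10, 200, 255 };
        const std::vector<uint8_t> in2 = { 3, 2, 2 };
        const float scale = 0.5f;

        for(size_t i = 0; i < in1.size(); ++i)
        {
            const float prod = static_cast<float>(in1[i]) * static_cast<float>(in2[i]) * scale;
            const int   val  = static_cast<int>(prod);            // truncate toward zero
            std::printf("%d ", std::min(std::max(val, 0), 255));  // saturate to [0, 255] -> 15 200 255
        }
        std::printf("\n");
        return 0;
    }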
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index dc5ae4e..497e87b 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -41,7 +41,7 @@
using namespace arm_compute;
CLPoolingLayerKernel::CLPoolingLayerKernel()
- : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0)
+ : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1)
{
}
@@ -52,103 +52,126 @@
void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
{
- int pool_pad_x = 0;
- int pool_pad_y = 0;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- unsigned int pooled_w = 0;
- unsigned int pooled_h = 0;
- const PoolingType pool_type = pool_info.pool_type();
- const int pool_size = pool_info.pool_size();
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
- DimensionRoundingType pool_round = pad_stride_info.round();
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ unsigned int pooled_w = 0;
+ unsigned int pooled_h = 0;
+ const PoolingType pool_type = pool_info.pool_type();
+ const int pool_size = pool_info.pool_size();
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
+ ARM_COMPUTE_ERROR_ON(pool_size > 7 && is_data_type_fixed_point(input->info()->data_type()));
// Check output dimensions
std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
input->info()->dimension(1),
pool_size,
- pool_stride_x, pool_stride_y,
- pool_pad_x, pool_pad_y,
- pool_round);
- ARM_COMPUTE_UNUSED(pooled_w);
- ARM_COMPUTE_UNUSED(pooled_h);
+ pool_size,
+ pool_info.pad_stride_info());
+
+ // Output auto initialization if not yet initialized
+ {
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(0, pooled_w);
+ output_shape.set(1, pooled_h);
+
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
- const int input_width = input->info()->dimension(0);
- const int input_height = input->info()->dimension(1);
- const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+ const int input_width = input->info()->dimension(0);
+ const int input_height = input->info()->dimension(1);
// Set instance variables
- _input = input;
- _output = output;
- _pool_info = pool_info;
- _border_size = BorderSize(pool_pad_y, pool_pad_x);
- _border_size.right = std::max(upper_bound_w, pool_pad_x);
- _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+ _input = input;
+ _output = output;
+ _pool_info = pool_info;
+ _border_size = BorderSize(pool_pad_y, pool_pad_x);
// Set build options
std::set<std::string> build_opts;
build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(("-DPOOL_" + ((PoolingType::MAX == pool_type) ? std::string("MAX") : std::string("AVG"))));
+ build_opts.emplace(("-DPOOL_" + string_from_pooling_type(pool_type)));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
+
+ build_opts.emplace(("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)));
+ if(pool_type != PoolingType::MAX)
+ {
+ build_opts.emplace(("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + pool_pad_x)));
+ build_opts.emplace(("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + pool_pad_y)));
+ build_opts.emplace(("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y)));
+ build_opts.emplace(("-DPAD_X=" + support::cpp11::to_string(pool_pad_x)));
+ build_opts.emplace(("-DPAD_Y=" + support::cpp11::to_string(pool_pad_y)));
+ }
// Create kernel
- std::string kernel_name = "pooling_layer_" + val_to_string(pool_size);
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
-
- // Set static kernel arguments
- if(pool_type == PoolingType::AVG)
+ if(pool_size <= 7)
{
- // Create static kernel arguments
- const cl_int2 max_dims =
- {
- {
- static_cast<cl_int>(input->info()->dimension(0)) + pool_pad_x,
- static_cast<cl_int>(input->info()->dimension(1)) + pool_pad_y,
- }
- };
- const cl_int2 strides =
- {
- {
- pool_stride_x,
- pool_stride_y,
- }
- };
- const cl_int2 paddings =
- {
- {
- pool_pad_x,
- pool_pad_y,
- }
- };
+ // Check if we have a 3x3 pooling with stride_x less than or equal to 3. In this case, run an optimized OpenCL kernel where
+ // each thread computes 4 output elements
+ const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(input->info()->data_type());
- // Set static kernel arguments
- unsigned int idx = 2 * num_arguments_per_3D_tensor();
- _kernel.setArg<cl_int2>(idx++, max_dims);
- _kernel.setArg<cl_int2>(idx++, strides);
- _kernel.setArg<cl_int2>(idx++, paddings);
+ int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size;
+ if(is_pool3x3_stride_le3)
+ {
+ // Change the number of elements processed and read per iteration for 3x3 pooling with stride_x less than or equal to 3
+ _num_elems_processed_per_iteration = 4;
+ num_elements_read_per_iteration = pool_size * (pool_stride_x + 1);
+ }
+
+ const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+ _border_size.right = std::max(upper_bound_w, pool_pad_x);
+ _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+
+ std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size);
+ if(is_pool3x3_stride_le3)
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name + "_optimized", build_opts));
+ }
+ else
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+ }
+ }
+ else // Run general case
+ {
+ _num_elems_processed_per_iteration = 1;
+
+ const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+ _border_size.right = std::max(upper_bound_w, pool_pad_x);
+ _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+
+ build_opts.emplace(("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size)));
+ if(input->info()->data_type() == DataType::F16)
+ {
+ build_opts.emplace("-DFP16");
+ }
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("pooling_layer_N", build_opts));
}
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration = 1;
-
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
+ Window win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration));
AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
-
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
ICLKernel::configure(win);
}
@@ -161,13 +184,14 @@
std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
- Window slice = window.first_slice_window_3D();
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = window_collapsed.first_slice_window_3D();
do
{
// Upsample input by pool size
Window in_slice(slice);
- in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x));
+ in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x * _num_elems_processed_per_iteration));
in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
// Set inputs
@@ -176,5 +200,5 @@
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_3D(slice));
+ while(window_collapsed.slide_window_slice_3D(slice));
}
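
Note: the pooled size and right/bottom border computed in configure() above follow from the usual floor-rounded formula. A standalone sketch with illustrative values (non-vectorised case, so the read width equals pool_size):

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const int in_w = 7, in_h = 7;
        const int pool_size = 3, stride_x = 2, stride_y = 2, pad_x = 0, pad_y = 0;

        // Assumed floor rounding: pooled = (in + 2 * pad - pool_size) / stride + 1
        const int pooled_w = (in_w + 2 * pad_x - pool_size) / stride_x + 1; // 3
        const int pooled_h = (in_h + 2 * pad_y - pool_size) / stride_y + 1; // 3

        // How far the last window reads past the input, clamped below by the padding
        const int upper_bound_w = (pooled_w - 1) * stride_x - pad_x + pool_size - in_w;
        const int upper_bound_h = (pooled_h - 1) * stride_y - pad_y + pool_size - in_h;

        std::printf("pooled %dx%d, border right=%d bottom=%d\n", pooled_w, pooled_h,
                    std::max(upper_bound_w, pad_x), std::max(upper_bound_h, pad_y));
        return 0;
    }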
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
new file mode 100644
index 0000000..4756443
--- /dev/null
+++ b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLQuantizationLayerKernel::CLQuantizationLayerKernel()
+ : _input(nullptr), _output(nullptr), _min_max(nullptr)
+{
+}
+
+void CLQuantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *min_max)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output, min_max);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::U8, 0);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _output = output;
+ _min_max = min_max;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("quantization_layer"));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+
+ // Update window and padding
+ update_window_and_padding(win, input_access, output_access, min_max_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLQuantizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
+ Window slice = window_collapsed.first_slice_window_3D();
+
+ Window window_min_max;
+ window_min_max.use_tensor_dimensions(_min_max->info()->tensor_shape());
+ window_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_min_max.collapse_if_possible(ICLKernel::window(), 1);
+
+ Window slice_min_max = window_min_max.first_slice_window_1D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ add_1D_tensor_argument(idx, _min_max, slice_min_max);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice) && window_min_max.slide_window_slice_1D(slice_min_max));
+}
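The OpenCL source for "quantization_layer" is not included in this hunk, so the standalone C++ sketch below only illustrates the per-tensor min/max quantization this kernel is built around; the helper name and the clamping behaviour are assumptions for illustration, not the library's API.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical reference: quantize an F32 buffer to U8 using a per-tensor [min, max] pair,
// mirroring the inputs CLQuantizationLayerKernel wires up (input, output, min_max).
std::vector<uint8_t> quantize_f32_to_u8(const std::vector<float> &in, float min_val, float max_val)
{
    std::vector<uint8_t> out(in.size());
    const float range = std::max(max_val - min_val, 1e-7f); // guard against a degenerate range
    for(std::size_t i = 0; i < in.size(); ++i)
    {
        const float normalized = (in[i] - min_val) / range;       // map to [0, 1]
        const float scaled     = std::round(normalized * 255.0f); // map to [0, 255]
        out[i] = static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, scaled)));
    }
    return out;
}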
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
new file mode 100644
index 0000000..4e000c6
--- /dev/null
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
+ : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
+{
+}
+
+void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, rois, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
+ ARM_COMPUTE_ERROR_ON(rois->num_values() == 0);
+
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values());
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(rois->num_values() != output->info()->dimension(3));
+
+ // Set instance variables
+ _input = input;
+ _rois = rois;
+ _output = output;
+ _pool_info = pool_info;
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX))));
+ build_opts.emplace(("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY))));
+ build_opts.emplace(("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ))));
+ build_opts.emplace(("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width())));
+ build_opts.emplace(("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())));
+ build_opts.emplace(("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale())));
+
+ // Create kernel
+ std::string kernel_name = "roi_pooling_layer";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_3D_tensor() + num_arguments_per_1D_array();
+ add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
+ add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = 1;
+ Window window = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowStatic input_access(input->info(),
+ input->info()->valid_region().start(0),
+ input->info()->valid_region().start(1),
+ input->info()->valid_region().end(0),
+ input->info()->valid_region().end(1));
+ AccessWindowStatic output_access(output->info(), 0, 0, pool_info.pooled_width(), pool_info.pooled_height());
+
+ update_window_and_padding(window, input_access, output_access);
+ output_access.set_valid_region(window, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ ICLKernel::configure(window);
+}
+
+void CLROIPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice(window);
+ // Parallelize spatially and across the fourth dimension of the output tensor (also across ROIArray)
+ slice.set(Window::DimZ, window[3]);
+
+ // Set arguments
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_1D_array_argument<ROI>(idx, _rois, Strides(sizeof(ROI)), 1U, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+}
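The kernel parallelises over the pooled output and, per ROI, max-pools each bin of the scaled region. The sketch below shows that bin arithmetic for a single-channel, row-major feature map; the SimpleROI struct and the rounding of the ROI by spatial_scale follow the usual ROI-pooling formulation and are assumptions for illustration, not the kernel's OpenCL source.

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

struct SimpleROI { int x, y, width, height; }; // illustrative ROI in input-image coordinates

// Max-pool one ROI of a single-channel feature map (row-major, w x h) into pooled_w x pooled_h bins.
std::vector<float> roi_pool(const std::vector<float> &fm, int w, int h,
                            SimpleROI roi, float spatial_scale, int pooled_w, int pooled_h)
{
    // Scale the ROI from image coordinates to feature-map coordinates
    const int rx = static_cast<int>(std::round(roi.x * spatial_scale));
    const int ry = static_cast<int>(std::round(roi.y * spatial_scale));
    const int rw = std::max(1, static_cast<int>(std::round(roi.width * spatial_scale)));
    const int rh = std::max(1, static_cast<int>(std::round(roi.height * spatial_scale)));

    std::vector<float> out(pooled_w * pooled_h, 0.f);
    for(int py = 0; py < pooled_h; ++py)
    {
        for(int px = 0; px < pooled_w; ++px)
        {
            // Bin boundaries inside the ROI, clamped to the feature map
            const int x0 = std::min(w, std::max(0, rx + (px * rw) / pooled_w));
            const int x1 = std::min(w, std::max(0, rx + ((px + 1) * rw + pooled_w - 1) / pooled_w));
            const int y0 = std::min(h, std::max(0, ry + (py * rh) / pooled_h));
            const int y1 = std::min(h, std::max(0, ry + ((py + 1) * rh + pooled_h - 1) / pooled_h));

            float max_val = std::numeric_limits<float>::lowest();
            bool  hit     = false;
            for(int y = y0; y < y1; ++y)
            {
                for(int x = x0; x < x1; ++x)
                {
                    max_val = std::max(max_val, fm[y * w + x]);
                    hit     = true;
                }
            }
            out[py * pooled_w + px] = hit ? max_val : 0.f; // empty bins contribute zero
        }
    }
    return out;
}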
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
new file mode 100644
index 0000000..18a8e35
--- /dev/null
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLReductionOperationKernel::CLReductionOperationKernel()
+ : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE), _border_size()
+{
+}
+
+BorderSize CLReductionOperationKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ // Output tensor auto initialization if not yet initialized
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(axis, 1);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis. Supported axis is 0");
+
+ const unsigned int num_elems_processed_per_iteration = 16;
+ const unsigned int border_width = ((input->info()->dimension(0) % 128) != 0) ? 128 - input->info()->dimension(0) % 128 : 0;
+
+ _input = input;
+ _output = output;
+ _reduction_axis = axis;
+ _op = op;
+ _lws_hint = cl::NDRange(8);
+ _border_size = BorderSize(0, border_width, 0, 0);
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
+
+ switch(op)
+ {
+ case ReductionOperation::SUM_SQUARE:
+ build_opts.emplace(("-DOPERATION=square_sum"));
+ break;
+ case ReductionOperation::SUM:
+ build_opts.emplace(("-DOPERATION=sum"));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported reduction operation");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_operation", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input->info(), 0, 0, input->info()->dimension(0) + border_width, 1);
+ AccessWindowHorizontal output_access(output->info(), 0, 1);
+
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, output->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Set out window
+ Window out_window(window);
+ out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ // Get first input and output slices
+ Window in_slice = window.first_slice_window_1D();
+ Window out_slice = out_window.first_slice_window_1D();
+
+ // Reshape window
+ const unsigned int border_width = ((in_slice.x().end() % 128) != 0) ? 128 - in_slice.x().end() % 128 : 0;
+ in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
+
+ // Set local sums buffer
+ unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size();
+ _kernel.setArg(num_arguments_per_1D_tensor() * 2, local_sum_size, nullptr);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, in_slice);
+ add_1D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, _lws_hint);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
+}
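Both configure() and run() above pad each row up to the next multiple of 128 so the work-group (local size 8, 16 elements per work item) always reduces fixed-size chunks. A minimal host-side sketch of that border arithmetic and of the invariant that padded elements must not affect the sum:

#include <cstddef>
#include <vector>

// Pad a row length up to the next multiple of 128, exactly as configure() and run() above do.
unsigned int reduction_border_width(unsigned int width)
{
    return (width % 128 != 0) ? 128 - (width % 128) : 0;
}

// Reference sum over one row; the padded tail must behave as zeros so it cannot change the result.
float reduce_sum(const std::vector<float> &row)
{
    const unsigned int width  = static_cast<unsigned int>(row.size());
    const unsigned int padded = width + reduction_border_width(width);

    float acc = 0.f;
    for(unsigned int i = 0; i < padded; ++i)
    {
        acc += (i < width) ? row[i] : 0.f;
    }
    return acc;
}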
diff --git a/src/core/CL/kernels/CLReshapeLayerKernel.cpp b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
new file mode 100644
index 0000000..0131bd3
--- /dev/null
+++ b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <string>
+
+using namespace arm_compute;
+
+CLReshapeLayerKernel::CLReshapeLayerKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLReshapeLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size());
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ // Create kernel
+ std::set<std::string> build_opts = { "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()) };
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_layer", build_opts));
+
+ // Add static arguments
+ const cl_int2 input_shape =
+ {
+ {
+ static_cast<cl_int>(_input->info()->tensor_shape()[0]),
+ static_cast<cl_int>(_input->info()->tensor_shape()[1])
+ }
+ };
+ const cl_int2 output_shape =
+ {
+ {
+ static_cast<cl_int>(_output->info()->tensor_shape()[0]),
+ static_cast<cl_int>(_output->info()->tensor_shape()[1])
+ }
+ };
+ unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<cl_int2>(idx++, input_shape);
+ _kernel.setArg<cl_int2>(idx++, output_shape);
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output->info(), 0, 0, output->info()->tensor_shape().x(), output->info()->tensor_shape().y());
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLReshapeLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = window_collapsed.first_slice_window_3D();
+
+ // Set inputs
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, window_collapsed);
+ add_3D_tensor_argument(idx, _output, window_collapsed);
+ enqueue(queue, *this, slice);
+}
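Only the 2D x/y extents of the two tensors are passed to the kernel, because a reshape preserves the linear element order and the remaining coordinates come from the 3D window. The sketch below shows that index remapping on the host, assuming the usual x-fastest layout; it is an illustration of the idea, not the OpenCL kernel's code.

#include <array>
#include <cstddef>

// Convert (x, y, z) coordinates of one shape into the coordinates of another shape
// with the same total number of elements, preserving the linear element order.
std::array<std::size_t, 3> remap_reshape(std::array<std::size_t, 3> coord,
                                         std::array<std::size_t, 3> in_shape,
                                         std::array<std::size_t, 3> out_shape)
{
    // Linear index with x as the fastest-moving dimension
    const std::size_t linear = coord[0] + coord[1] * in_shape[0] + coord[2] * in_shape[0] * in_shape[1];

    // Unravel the same linear index in the output shape
    std::array<std::size_t, 3> out_coord =
    {
        linear % out_shape[0],
        (linear / out_shape[0]) % out_shape[1],
        linear / (out_shape[0] * out_shape[1])
    };
    return out_coord;
}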
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index d74e837..82ebe64 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -46,9 +46,10 @@
void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(output == input);
_input = input;
_output = output;
@@ -76,24 +77,33 @@
// Configure kernel window
constexpr unsigned int num_elems_processed_per_iteration = 4;
- const int border_offset = (border_undefined) ? 0 : border_size().left;
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input->info(), -border_offset, -border_offset,
- input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
+ const ValidRegion &input_valid_region = input->info()->valid_region();
+
+ // Reads can occur within the valid region of the input
+ AccessWindowStatic input_access(input->info(),
+ input_valid_region.anchor[0] - border_size().left, input_valid_region.anchor[1] - border_size().top,
+ input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size().right,
+ input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom);
+
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()), output->info()->tensor_shape(), policy, border_size(),
+ border_undefined));
ICLKernel::configure(win);
// Set static kernel arguments
+ const float scale_x = static_cast<float>(input->info()->dimension(0)) / output->info()->dimension(0);
+ const float scale_y = static_cast<float>(input->info()->dimension(1)) / output->info()->dimension(1);
+
unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
_kernel.setArg<float>(idx++, input->info()->dimension(0));
_kernel.setArg<float>(idx++, input->info()->dimension(1));
- _kernel.setArg<float>(idx++, output->info()->dimension(0));
- _kernel.setArg<float>(idx++, output->info()->dimension(1));
+ _kernel.setArg<float>(idx++, scale_x);
+ _kernel.setArg<float>(idx++, scale_y);
}
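The replaced arguments pass precomputed ratios scale_x = input_width / output_width and scale_y = input_height / output_height, so the kernel maps each output pixel straight back to input coordinates instead of receiving the raw output dimensions. A host-side sketch of that mapping for nearest-neighbour sampling, for illustration only (the actual interpolation and border handling live in the OpenCL kernel):

#include <algorithm>
#include <cmath>

// Map an output pixel (ox, oy) back to input coordinates using the same ratios
// the kernel receives as static arguments. Nearest-neighbour rounding shown as an example.
void scale_map_nearest(int ox, int oy, int in_w, int in_h, int out_w, int out_h, int &ix, int &iy)
{
    const float scale_x = static_cast<float>(in_w) / out_w;
    const float scale_y = static_cast<float>(in_h) / out_h;

    ix = std::min(in_w - 1, std::max(0, static_cast<int>(std::floor(ox * scale_x))));
    iy = std::min(in_h - 1, std::max(0, static_cast<int>(std::floor(oy * scale_y))));
}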
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 0470d52..da3b942 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -41,9 +41,19 @@
void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ // Softmax across the x dimension
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(0, 1);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
_input = input;
_output = output;
@@ -52,7 +62,16 @@
const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
// Set build options
- std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
+ }
+ else if(input->info()->data_type() == DataType::F16)
+ {
+ build_opts.emplace("-DUSE_F16");
+ }
// Tell the kernel that the width is not a multiple of 16
if((input->info()->dimension(0) % max_cl_vector_width) != 0)
@@ -64,7 +83,7 @@
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_max", build_opts));
// Set fixed arguments
- unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
_kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
// Configure kernel window
@@ -88,11 +107,17 @@
void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output, max, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
_input = input;
_max = max;
@@ -103,7 +128,16 @@
const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
// Set build options
- std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
+ }
+ else if(input->info()->data_type() == DataType::F16)
+ {
+ build_opts.emplace("-DUSE_F16");
+ }
// Tell the kernel that the width is not a multiple of 16
if((input->info()->dimension(0) % max_cl_vector_width) != 0)
@@ -115,7 +149,7 @@
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
// Set fixed arguments
- unsigned int idx = 4 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
_kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
// Configure window
@@ -139,19 +173,20 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = window_collapsed.first_slice_window_3D();
do
{
unsigned int idx = 0;
// Set inputs
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _max, slice);
- add_2D_tensor_argument(idx, _output, slice);
- add_2D_tensor_argument(idx, _sum, slice);
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _max, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ add_3D_tensor_argument(idx, _sum, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice));
+ while(window_collapsed.slide_window_slice_3D(slice));
}
CLLogits1DNormKernel::CLLogits1DNormKernel()
@@ -161,10 +196,15 @@
void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(sum, output);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
_input = input;
_sum = sum;
@@ -172,7 +212,11 @@
// Set build options
std::set<std::string> build_opts;
- build_opts.emplace(("-DUSE_" + string_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts));
@@ -198,7 +242,8 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = window_collapsed.first_slice_window_3D();
do
{
@@ -207,10 +252,10 @@
unsigned int idx = 0;
// Set inputs
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _sum, sum_slice);
- add_2D_tensor_argument(idx, _output, slice);
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _sum, sum_slice);
+ add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice));
+ while(window_collapsed.slide_window_slice_3D(slice));
}
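The three kernels updated above implement softmax as max, shifted exp/sum, then normalisation, which keeps the exponentials bounded and is why the max tensor feeds the second stage. A scalar reference of the same pipeline over one row, for illustration:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Reference softmax over one row, split into the same three stages as the CL kernels:
// 1) row maximum, 2) exp(x - max) and the running sum, 3) division by the sum.
std::vector<float> softmax_row(const std::vector<float> &row)
{
    const float max_val = *std::max_element(row.begin(), row.end()); // stage 1: softmax_layer_max

    std::vector<float> shifted_exp(row.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < row.size(); ++i)                      // stage 2: shift_exp_sum
    {
        shifted_exp[i] = std::exp(row[i] - max_val);
        sum += shifted_exp[i];
    }

    for(float &v : shifted_exp)                                      // stage 3: norm
    {
        v /= sum;
    }
    return shifted_exp;
}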
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index 2ee6fcb..75d31d5 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -40,8 +40,9 @@
void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
TensorShape output_shape{ input->info()->tensor_shape() };
const size_t w_out = input->info()->dimension(1);
@@ -52,8 +53,9 @@
 // Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
index fddb580..a47952f 100644
--- a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
+++ b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
@@ -88,8 +88,8 @@
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowStatic output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+ AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, input->info()->dimension(0) + border_size().right, input->info()->dimension(1) + border_size().bottom);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
index 018f272..bc27477 100644
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -34,31 +34,40 @@
using namespace arm_compute;
-CLWeightsReshapeKernel::CLWeightsReshapeKernel(bool is_shared)
- : _is_shared(is_shared), _input(nullptr), _biases(nullptr), _output(nullptr)
+CLWeightsReshapeKernel::CLWeightsReshapeKernel()
+ : _input(nullptr), _biases(nullptr), _output(nullptr)
{
}
void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- if(_is_shared)
- {
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(4) != (output->info()->dimension(2)));
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 5);
- ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 3);
- }
- else
- {
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4);
- ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2);
- }
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- // Check biases
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.collapse(3);
+ const size_t tmp_dim = output_shape[0];
+ output_shape.set(0, output_shape[1]);
+ output_shape.set(1, tmp_dim + (biases != nullptr ? 1 : 0));
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, dt, fixed_point_position);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
if(biases != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->num_dimensions() != 1));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->num_dimensions() != 2));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3]));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3] || biases->info()->dimension(1) != input->info()->tensor_shape()[4]));
}
_biases = biases;
@@ -69,6 +78,10 @@
std::set<std::string> build_opts;
build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
build_opts.emplace(((biases != nullptr) ? "-DHAS_BIAS" : ""));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
@@ -88,49 +101,13 @@
ICLKernel::configure(win);
}
-CLConvolutionLayerWeightsReshapeKernel::CLConvolutionLayerWeightsReshapeKernel()
- : CLWeightsReshapeKernel(false)
-{
-}
-
-void CLConvolutionLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
Window out_window;
- out_window.use_tensor_dimensions(_output->info());
-
- Window in_slice = window.first_slice_window_3D();
- Window out_slice = out_window.first_slice_window_2D();
-
- // Set arguments
- unsigned idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
- if(_biases != nullptr)
- {
- Window biases_slice;
- biases_slice.set(Window::DimX, Window::Dimension(0, _biases->info()->tensor_shape().x(), 1));
- add_1D_tensor_argument(idx, _biases, biases_slice);
- }
-
- // Run kernel
- enqueue(queue, *this, in_slice);
-}
-
-CLLocallyConnectedLayerWeightsReshapeKernel::CLLocallyConnectedLayerWeightsReshapeKernel()
- : CLWeightsReshapeKernel(true)
-{
-}
-
-void CLLocallyConnectedLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window out_window;
- out_window.use_tensor_dimensions(_output->info());
+ out_window.use_tensor_dimensions(_output->info()->tensor_shape());
Window in_slice = window.first_slice_window_3D();
Window out_slice = out_window.first_slice_window_2D();
@@ -140,7 +117,7 @@
if(_biases != nullptr)
{
- biases_window.use_tensor_dimensions(_biases->info());
+ biases_window.use_tensor_dimensions(_biases->info()->tensor_shape());
biases_slice = biases_window.first_slice_window_1D();
}
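The new configure() derives the reshaped weights shape by collapsing the first three dimensions, swapping the result with the OFM dimension, and reserving one extra row when biases are appended. A small sketch of that shape arithmetic for the common 4D [kernel_x, kernel_y, ifm, ofm] weights layout, as an illustration of the code above:

#include <array>
#include <cstddef>

// Reproduce the output shape computed in CLWeightsReshapeKernel::configure() for a
// 4D weights tensor: collapse (x, y, ifm) into one dimension, swap it with ofm, and
// add one row if a bias vector is appended.
std::array<std::size_t, 2> reshaped_weights_shape(std::size_t kernel_x, std::size_t kernel_y,
                                                  std::size_t ifm, std::size_t ofm, bool has_bias)
{
    const std::size_t collapsed = kernel_x * kernel_y * ifm;             // output_shape.collapse(3)
    std::array<std::size_t, 2> shape = { ofm, collapsed + (has_bias ? 1u : 0u) }; // swap dims, +1 row for the bias
    return shape;
}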
diff --git a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
index 884da28..418d349 100644
--- a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
+++ b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
@@ -37,12 +37,12 @@
namespace
{
-inline void check_corner(float x, float y, float strength, InternalKeypoint *output, int32_t *num_corner_candidates, std::mutex *corner_candidates_mutex)
+inline void check_corner(float x, float y, float strength, InternalKeypoint *output, int32_t *num_corner_candidates, arm_compute::Mutex *corner_candidates_mutex)
{
if(strength != 0.0f)
{
/* Set index and update num_corner_candidate */
- std::unique_lock<std::mutex> lock(*corner_candidates_mutex);
+ std::unique_lock<arm_compute::Mutex> lock(*corner_candidates_mutex);
const int32_t idx = *num_corner_candidates;
@@ -55,12 +55,9 @@
}
}
-inline void corner_candidates(const float *__restrict input, InternalKeypoint *__restrict output, int32_t x, int32_t y, int32_t *num_corner_candidates, std::mutex *corner_candidates_mutex)
+inline void corner_candidates(const float *__restrict input, InternalKeypoint *__restrict output, int32_t x, int32_t y, int32_t *num_corner_candidates, arm_compute::Mutex *corner_candidates_mutex)
{
- check_corner(x + 0, y, *(input + 0), output, num_corner_candidates, corner_candidates_mutex);
- check_corner(x + 1, y, *(input + 1), output, num_corner_candidates, corner_candidates_mutex);
- check_corner(x + 2, y, *(input + 2), output, num_corner_candidates, corner_candidates_mutex);
- check_corner(x + 3, y, *(input + 3), output, num_corner_candidates, corner_candidates_mutex);
+ check_corner(x, y, *input, output, num_corner_candidates, corner_candidates_mutex);
}
} // namespace
@@ -86,7 +83,7 @@
_output = output;
_num_corner_candidates = num_corner_candidates;
- const unsigned int num_elems_processed_per_iteration = 4;
+ const unsigned int num_elems_processed_per_iteration = 1;
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
@@ -96,8 +93,9 @@
INEKernel::configure(win);
}
-void CPPCornerCandidatesKernel::run(const Window &window)
+void CPPCornerCandidatesKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Iterator input(_input, window);
diff --git a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
index 62bfdd6..ebe3db9 100644
--- a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
@@ -59,8 +59,9 @@
IKernel::configure(Window()); // Default 1 iteration window
}
-void CPPDetectionWindowNonMaximaSuppressionKernel::run(const Window &window)
+void CPPDetectionWindowNonMaximaSuppressionKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_input_output->buffer() == nullptr);
diff --git a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
index 09d3ccf..3b1c7ae 100644
--- a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
+++ b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
@@ -68,8 +68,9 @@
return false;
}
-void CPPSortEuclideanDistanceKernel::run(const Window &window)
+void CPPSortEuclideanDistanceKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICPPKernel::window(), window);
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index ff903e9..fc0b6e9 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -50,7 +50,7 @@
anchor[0] + border_size.left,
// Skip the border right of the image
// Make sure the window width is a multiple of the step size
- anchor[0] + border_size.left + ceil_to_multiple(shape[0] - border_size.left - border_size.right, steps[0]),
+ anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
steps[0]));
size_t n = 1;
@@ -62,7 +62,7 @@
// Skip the border above the image
anchor[1] + border_size.top,
// Skip the border below the image
- anchor[1] + border_size.top + ceil_to_multiple(shape[1] - border_size.top - border_size.bottom, steps[1]),
+ anchor[1] + border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]),
steps[1]));
++n;
@@ -137,7 +137,7 @@
anchor[0] + border_size.left,
// Skip the border right of the image
// Make sure the window width is a multiple of the step size
- anchor[0] + border_size.left + ceil_to_multiple(shape[0] - border_size.left - border_size.right, steps[0]),
+ anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
steps[0]));
size_t n = 1;
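Both hunks clamp the border-adjusted extent to zero before rounding up, so a border wider than the tensor no longer underflows the unsigned subtraction. A minimal sketch of the corrected expression, assuming the usual round-up semantics for ceil_to_multiple:

#include <algorithm>

// Round value up to the next multiple of step (same contract assumed for arm_compute's helper).
int ceil_to_multiple(int value, int step)
{
    return ((value + step - 1) / step) * step;
}

// Border-adjusted window extent: clamping to zero prevents (shape - borders) from
// underflowing when the borders are wider than the tensor itself.
int window_extent(int anchor, int shape, int border_near, int border_far, int step)
{
    const int inner = std::max(0, shape - border_near - border_far);
    return anchor + border_near + ceil_to_multiple(inner, step);
}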
diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
index 4ddc0fe..693d851 100644
--- a/src/core/IAccessWindow.cpp
+++ b/src/core/IAccessWindow.cpp
@@ -213,8 +213,8 @@
PaddingSize padding;
padding.left = std::max(0, -min_x);
padding.right = std::max<int>(0, max_x - shape[0]);
- padding.top = shape.num_dimensions() == 1 ? 0 : std::max(0, -min_y);
- padding.bottom = shape.num_dimensions() == 1 ? 0 : std::max<int>(0, max_y - shape[1]);
+ padding.top = std::max(0, -min_y);
+ padding.bottom = std::max<int>(0, max_y - shape[1]);
// Update strides in tensor info
return _info->extend_padding(padding);
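With the single-dimension special case removed, the required padding now follows directly from how far the access window reaches outside the tensor in each direction, in y as well as x. A host-side sketch of that calculation, with a plain struct standing in for PaddingSize:

#include <algorithm>

struct Padding { int left, right, top, bottom; }; // stand-in for arm_compute::PaddingSize

// Compute the padding needed so that accesses in [min_x, max_x) x [min_y, max_y)
// stay inside the allocated buffer of a width x height tensor.
Padding padding_for_access(int min_x, int max_x, int min_y, int max_y, int width, int height)
{
    Padding p{};
    p.left   = std::max(0, -min_x);         // accesses left of column 0
    p.right  = std::max(0, max_x - width);  // accesses past the last column
    p.top    = std::max(0, -min_y);         // accesses above row 0
    p.bottom = std::max(0, max_y - height); // accesses below the last row
    return p;
}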
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index 0b29eca..4a54675 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -55,9 +55,9 @@
dst_info->set_valid_region(src_info->valid_region());
Window win_src;
- win_src.use_tensor_dimensions(src_info, Window::DimY);
+ win_src.use_tensor_dimensions(src_info->tensor_shape(), Window::DimY);
Window win_dst;
- win_dst.use_tensor_dimensions(dst_info, Window::DimY);
+ win_dst.use_tensor_dimensions(dst_info->tensor_shape(), Window::DimY);
Iterator src_it(&src, win_src);
Iterator dst_it(this, win_dst);
@@ -147,4 +147,4 @@
s << io_fmt.row_delim;
}
}
-}
\ No newline at end of file
+}
diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
index edb0a0f..e0c2891 100644
--- a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
+++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
@@ -201,8 +201,9 @@
INEKernel::configure(win);
}
-void NEAbsoluteDifferenceKernel::run(const Window &window)
+void NEAbsoluteDifferenceKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp
index e5b933a..deafabe 100644
--- a/src/core/NEON/kernels/NEAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEAccumulateKernel.cpp
@@ -114,8 +114,9 @@
}
} // namespace fp16
-void NEAccumulateWeightedFP16Kernel::run(const Window &window)
+void NEAccumulateWeightedFP16Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
@@ -131,7 +132,7 @@
},
input, accum);
}
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
namespace
{
@@ -262,8 +263,9 @@
INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
}
-void NEAccumulateKernel::run(const Window &window)
+void NEAccumulateKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
Iterator input(_input, window);
@@ -300,8 +302,9 @@
INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
}
-void NEAccumulateWeightedKernel::run(const Window &window)
+void NEAccumulateWeightedKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
@@ -342,8 +345,9 @@
INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
}
-void NEAccumulateSquaredKernel::run(const Window &window)
+void NEAccumulateSquaredKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
Iterator input(_input, window);
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index a878078..67fc45b 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -41,21 +41,29 @@
using namespace arm_compute;
NEActivationLayerKernel::NEActivationLayerKernel()
- : _func(nullptr), _act_info(ActivationFunction::LOGISTIC)
+ : _input(nullptr), _output(nullptr), _func(nullptr), _act_info(ActivationFunction::LOGISTIC)
{
}
-void NEActivationLayerKernel::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
+void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+ _input = input;
+ _act_info = activation_info;
+ _output = input;
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ if(output != nullptr)
+ {
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ _output = output;
+ }
// Activation functions : FP32
static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 =
@@ -65,12 +73,31 @@
{ ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, float> },
{ ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, float> },
{ ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, float> },
+ { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, float> },
+ { ActivationFunction::LEAKY_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LEAKY_RELU, float> },
{ ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, float> },
{ ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, float> },
{ ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, float> },
{ ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, float> },
};
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ // Activation functions : FP16
+ static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f16 =
+ {
+ { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, float16_t> },
+ { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, float16_t> },
+ { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, float16_t> },
+ { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, float16_t> },
+ { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, float16_t> },
+ { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, float16_t> },
+ { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, float16_t> },
+ { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, float16_t> },
+ { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, float16_t> },
+ { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, float16_t> },
+ };
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
// Activation functions : QS8
static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs8 =
{
@@ -79,32 +106,207 @@
{ ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qint8_t> },
{ ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qint8_t> },
{ ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qint8_t> },
+ { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, qint8_t> },
+ { ActivationFunction::LEAKY_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LEAKY_RELU, qint8_t> },
{ ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, qint8_t> },
{ ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, qint8_t> },
{ ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint8_t> },
{ ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint8_t> },
};
+ // Activation functions : QS16
+ static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs16 =
+ {
+ { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, qint16_t> },
+ { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, qint16_t> },
+ { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qint16_t> },
+ { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qint16_t> },
+ { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qint16_t> },
+ { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, qint16_t> },
+ { ActivationFunction::LEAKY_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LEAKY_RELU, qint16_t> },
+ { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, qint16_t> },
+ { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, qint16_t> },
+ { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint16_t> },
+ { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint16_t> },
+ };
- _input = input;
- _output = output;
- _act_info = activation_info;
switch(input->info()->data_type())
{
- case DataType::F32:
- _func = act_map_f32[activation_info.activation()];
- break;
case DataType::QS8:
_func = act_map_qs8[activation_info.activation()];
break;
+ case DataType::QS16:
+ _func = act_map_qs16[activation_info.activation()];
+ break;
+ case DataType::F32:
+ _func = act_map_f32[activation_info.activation()];
+ break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ _func = act_map_f16[activation_info.activation()];
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
ARM_COMPUTE_ERROR("Unsupported data type.");
}
constexpr unsigned int num_elems_processed_per_iteration = 16;
- INESimpleKernel::configure(_input, _output, num_elems_processed_per_iteration);
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+ }
+ else
+ {
+ // In-place computation
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+ }
+
+ ICPPKernel::configure(win);
}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, float16_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ static const float16x8_t CONST_0 = vdupq_n_f16(0.f);
+ static const float16x8_t CONST_1 = vdupq_n_f16(1.f);
+
+ const float16x8_t a = vdupq_n_f16(_act_info.a());
+ const float16x8_t b = vdupq_n_f16(_act_info.b());
+
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+ const float16x8x2_t in = vld2q_f16(input_ptr);
+ float16x8x2_t tmp = { {} };
+
+ switch(F)
+ {
+ case ActivationFunction::ABS:
+ tmp =
+ {
+ {
+ vabsq_f16(in.val[0]),
+ vabsq_f16(in.val[1]),
+ }
+ };
+ break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_f16(a, vmaxq_f16(CONST_0, in.val[0])),
+ vminq_f16(a, vmaxq_f16(CONST_0, in.val[1]))
+ }
+ };
+ break;
+ case ActivationFunction::LU_BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_f16(a, vmaxq_f16(b, in.val[0])),
+ vminq_f16(a, vmaxq_f16(b, in.val[1]))
+ }
+ };
+ break;
+ case ActivationFunction::LINEAR:
+ tmp =
+ {
+ {
+ vaddq_f16(b, vmulq_f16(a, in.val[0])),
+ vaddq_f16(b, vmulq_f16(a, in.val[1]))
+ }
+ };
+ break;
+ case ActivationFunction::LOGISTIC:
+ tmp =
+ {
+ {
+ vinvq_f16(vaddq_f16(CONST_1, vexpq_f16(vnegq_f16(in.val[0])))),
+ vinvq_f16(vaddq_f16(CONST_1, vexpq_f16(vnegq_f16(in.val[1])))),
+ }
+ };
+ break;
+ case ActivationFunction::RELU:
+ tmp =
+ {
+ {
+ vmaxq_f16(CONST_0, in.val[0]),
+ vmaxq_f16(CONST_0, in.val[1])
+ }
+ };
+ break;
+ case ActivationFunction::LEAKY_RELU:
+ tmp =
+ {
+ {
+ vbslq_f16(vcgtq_f16(in.val[0], CONST_0), in.val[0], vmulq_f16(a, in.val[0])),
+ vbslq_f16(vcgtq_f16(in.val[1], CONST_0), in.val[1], vmulq_f16(a, in.val[1]))
+ }
+ };
+ break;
+ case ActivationFunction::SOFT_RELU:
+ tmp =
+ {
+ {
+ vlogq_f16(vaddq_f16(CONST_1, vexpq_f16(in.val[0]))),
+ vlogq_f16(vaddq_f16(CONST_1, vexpq_f16(in.val[1]))),
+ }
+ };
+ break;
+ case ActivationFunction::SQRT:
+ tmp =
+ {
+ {
+ vinvq_f16(vinvsqrtq_f16(in.val[0])),
+ vinvq_f16(vinvsqrtq_f16(in.val[1])),
+ }
+ };
+ break;
+ case ActivationFunction::SQUARE:
+ tmp =
+ {
+ {
+ vmulq_f16(in.val[0], in.val[0]),
+ vmulq_f16(in.val[1], in.val[1])
+ }
+ };
+ break;
+ case ActivationFunction::TANH:
+ tmp =
+ {
+ {
+ vmulq_f16(a, vtanhq_f16(vmulq_f16(b, in.val[0]))),
+ vmulq_f16(a, vtanhq_f16(vmulq_f16(b, in.val[1]))),
+ }
+ };
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ break;
+ }
+
+ vst2q_f16(output_ptr, tmp);
+ },
+ input, output);
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
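The activation tables above gain LU_BOUNDED_RELU and LEAKY_RELU entries, and the FP16 branch implements each function on pairs of float16x8_t vectors. The scalar definitions that the vector code corresponds to are summarised below as a float reference, for illustration only; a and b are the ActivationLayerInfo parameters.

#include <algorithm>
#include <cmath>

// Scalar reference for the activations handled above.
float relu(float x)                              { return std::max(0.f, x); }
float bounded_relu(float x, float a)             { return std::min(a, std::max(0.f, x)); }
float lu_bounded_relu(float x, float a, float b) { return std::min(a, std::max(b, x)); } // newly added
float leaky_relu(float x, float a)               { return (x > 0.f) ? x : a * x; }       // newly added
float soft_relu(float x)                         { return std::log(1.f + std::exp(x)); }
float logistic(float x)                          { return 1.f / (1.f + std::exp(-x)); }
float scaled_tanh(float x, float a, float b)     { return a * std::tanh(b * x); }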
+
template <ActivationLayerInfo::ActivationFunction F, typename T>
typename std::enable_if<std::is_same<T, float>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
{
@@ -137,17 +339,6 @@
}
};
break;
- case ActivationFunction::BOUNDED_RELU:
- tmp =
- {
- {
- vminq_f32(a, vmaxq_f32(CONST_0, in.val[0])),
- vminq_f32(a, vmaxq_f32(CONST_0, in.val[1])),
- vminq_f32(a, vmaxq_f32(CONST_0, in.val[2])),
- vminq_f32(a, vmaxq_f32(CONST_0, in.val[3])),
- }
- };
- break;
case ActivationFunction::LINEAR:
tmp =
{
@@ -181,6 +372,39 @@
}
};
break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[0])),
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[1])),
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[2])),
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[3])),
+ }
+ };
+ break;
+ case ActivationFunction::LU_BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_f32(a, vmaxq_f32(b, in.val[0])),
+ vminq_f32(a, vmaxq_f32(b, in.val[1])),
+ vminq_f32(a, vmaxq_f32(b, in.val[2])),
+ vminq_f32(a, vmaxq_f32(b, in.val[3])),
+ }
+ };
+ break;
+ case ActivationFunction::LEAKY_RELU:
+ tmp =
+ {
+ {
+ vbslq_f32(vcgtq_f32(in.val[0], CONST_0), in.val[0], vmulq_f32(a, in.val[0])),
+ vbslq_f32(vcgtq_f32(in.val[1], CONST_0), in.val[1], vmulq_f32(a, in.val[1])),
+ vbslq_f32(vcgtq_f32(in.val[2], CONST_0), in.val[2], vmulq_f32(a, in.val[2])),
+ vbslq_f32(vcgtq_f32(in.val[3], CONST_0), in.val[3], vmulq_f32(a, in.val[3])),
+ }
+ };
+ break;
case ActivationFunction::SOFT_RELU:
tmp =
{
@@ -237,14 +461,14 @@
template <ActivationLayerInfo::ActivationFunction F, typename T>
typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
{
- Iterator input(_input, window);
- Iterator output(_output, window);
- int fixed_point_position = _input->info()->fixed_point_position();
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+ const int fixed_point_position = _input->info()->fixed_point_position();
static const qint8x16_t CONST_0 = vdupq_n_qs8(0);
- const qint8x16_t CONST_1 = vdupq_n_qs8(scvt_qs8_f32(1.f, fixed_point_position));
- const qint8x16_t a = vdupq_n_qs8(scvt_qs8_f32(_act_info.a(), fixed_point_position));
- const qint8x16_t b = vdupq_n_qs8(scvt_qs8_f32(_act_info.b(), fixed_point_position));
+ const qint8x16_t CONST_1 = vdupq_n_qs8(sqcvt_qs8_f32(1.f, fixed_point_position));
+ const qint8x16_t a = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.a(), fixed_point_position));
+ const qint8x16_t b = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.b(), fixed_point_position));
execute_window_loop(window, [&](const Coordinates & id)
{
@@ -259,29 +483,35 @@
case ActivationFunction::ABS:
tmp = vqabsq_qs8(in);
break;
- case ActivationFunction::BOUNDED_RELU:
- tmp = vminq_qs8(a, vmaxq_qs8(CONST_0, in));
- break;
case ActivationFunction::LINEAR:
tmp = vqmlaq_qs8(b, a, in, fixed_point_position);
break;
case ActivationFunction::LOGISTIC:
- tmp = vrecipq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(vnegq_s8(in), fixed_point_position)), fixed_point_position);
+ tmp = vqrecipq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(vnegq_s8(in), fixed_point_position)), fixed_point_position);
break;
case ActivationFunction::RELU:
tmp = vmaxq_qs8(CONST_0, in);
break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp = vminq_qs8(a, vmaxq_qs8(CONST_0, in));
+ break;
+ case ActivationFunction::LU_BOUNDED_RELU:
+ tmp = vminq_qs8(a, vmaxq_qs8(b, in));
+ break;
+ case ActivationFunction::LEAKY_RELU:
+ tmp = vbslq_s8(vcgtq_s8(in, CONST_0), in, vmulq_qs8(a, in, fixed_point_position));
+ break;
case ActivationFunction::SOFT_RELU:
tmp = vlogq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(in, fixed_point_position)), fixed_point_position);
break;
case ActivationFunction::SQRT:
- tmp = vrecipq_qs8(vinvsqrtq_qs8(in, fixed_point_position), fixed_point_position);
+ tmp = vqrecipq_qs8(vqinvsqrtq_qs8(in, fixed_point_position), fixed_point_position);
break;
case ActivationFunction::SQUARE:
tmp = vqmulq_qs8(in, in, fixed_point_position);
break;
case ActivationFunction::TANH:
- tmp = vtanhq_qs8(in, fixed_point_position);
+ tmp = vqmulq_qs8(a, vqtanhq_qs8(vqmulq_qs8(b, in, fixed_point_position), fixed_point_position), fixed_point_position);
break;
default:
break;
@@ -292,10 +522,142 @@
input, output);
}
-void NEActivationLayerKernel::run(const Window &window)
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, qint16_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+ const int fixed_point_position = _input->info()->fixed_point_position();
+
+ static const qint16x8_t CONST_0 = vdupq_n_qs16(0);
+ const qint16x8_t CONST_1 = vdupq_n_qs16(sqcvt_qs16_f32(1.f, fixed_point_position));
+ const qint16x8_t a = vdupq_n_qs16(sqcvt_qs16_f32(_act_info.a(), fixed_point_position));
+ const qint16x8_t b = vdupq_n_qs16(sqcvt_qs16_f32(_act_info.b(), fixed_point_position));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ const qint16x8x2_t in = vld2q_s16(input_ptr);
+ qint16x8x2_t tmp = { {} };
+
+ switch(F)
+ {
+ case ActivationFunction::ABS:
+ tmp =
+ {
+ {
+ vqabsq_qs16(in.val[0]),
+ vqabsq_qs16(in.val[1]),
+ }
+ };
+ break;
+ case ActivationFunction::LINEAR:
+ tmp =
+ {
+ {
+ vqmlaq_qs16(b, a, in.val[0], fixed_point_position),
+ vqmlaq_qs16(b, a, in.val[1], fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::LOGISTIC:
+ tmp =
+ {
+ {
+ vqrecipq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(vnegq_s16(in.val[0]), fixed_point_position)), fixed_point_position),
+ vqrecipq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(vnegq_s16(in.val[1]), fixed_point_position)), fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::RELU:
+ tmp =
+ {
+ {
+ vmaxq_qs16(CONST_0, in.val[0]),
+ vmaxq_qs16(CONST_0, in.val[1]),
+ }
+ };
+ break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_qs16(a, vmaxq_qs16(CONST_0, in.val[0])),
+ vminq_qs16(a, vmaxq_qs16(CONST_0, in.val[1])),
+ }
+ };
+ break;
+ case ActivationFunction::LU_BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_qs16(a, vmaxq_qs16(b, in.val[0])),
+ vminq_qs16(a, vmaxq_qs16(b, in.val[1])),
+ }
+ };
+ break;
+ case ActivationFunction::LEAKY_RELU:
+ tmp =
+ {
+ {
+ vbslq_s16(vcgtq_s16(in.val[0], CONST_0), in.val[0], vmulq_qs16(a, in.val[0], fixed_point_position)),
+ vbslq_s16(vcgtq_s16(in.val[1], CONST_0), in.val[1], vmulq_qs16(a, in.val[1], fixed_point_position)),
+ }
+ };
+ break;
+ case ActivationFunction::SOFT_RELU:
+ tmp =
+ {
+ {
+ vlogq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(in.val[0], fixed_point_position)), fixed_point_position),
+ vlogq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(in.val[1], fixed_point_position)), fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::SQRT:
+ tmp =
+ {
+ {
+ vqrecipq_qs16(vqinvsqrtq_qs16(in.val[0], fixed_point_position), fixed_point_position),
+ vqrecipq_qs16(vqinvsqrtq_qs16(in.val[1], fixed_point_position), fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::SQUARE:
+ tmp =
+ {
+ {
+ vqmulq_qs16(in.val[0], in.val[0], fixed_point_position),
+ vqmulq_qs16(in.val[1], in.val[1], fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::TANH:
+ tmp =
+ {
+ {
+ vqmulq_qs16(a, vqtanhq_qs16(vqmulq_qs16(b, in.val[0], fixed_point_position), fixed_point_position), fixed_point_position),
+ vqmulq_qs16(a, vqtanhq_qs16(vqmulq_qs16(b, in.val[1], fixed_point_position), fixed_point_position), fixed_point_position),
+ }
+ };
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Function not implemented");
+ break;
+ }
+
+ vst2q_qs16(output_ptr, tmp);
+ },
+ input, output);
+}
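
The QS8 and QS16 specialisations convert the float activation parameters with sqcvt_qs8_f32 / sqcvt_qs16_f32 before broadcasting them. A minimal scalar sketch of what such a saturating float-to-fixed-point conversion does, assuming round-to-nearest; the library helpers' exact rounding is not shown in this patch.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative scalar model of a saturating float -> QS16 conversion with
// 'fixed_point_position' fractional bits; the library's sqcvt_qs16_f32 may round differently.
inline int16_t to_qs16(float value, int fixed_point_position)
{
    const float scaled  = value * static_cast<float>(1 << fixed_point_position);
    const float clamped = std::min(std::max(std::round(scaled), -32768.f), 32767.f);
    return static_cast<int16_t>(clamped);
}
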
+
+void NEActivationLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
(this->*_func)(window);
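
Every NEON kernel's run() now takes a ThreadInfo argument alongside the window, and kernels that do not need it mark it with ARM_COMPUTE_UNUSED. A minimal sketch of a user-defined kernel adapted to the new interface; the header paths and the class name MyCustomKernel are assumptions, and only the signature shown in this patch is relied upon.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/Window.h"

// Hypothetical user kernel updated to the run(Window, ThreadInfo) interface.
class MyCustomKernel : public arm_compute::INEKernel
{
public:
    void run(const arm_compute::Window &window, const arm_compute::ThreadInfo &info) override
    {
        ARM_COMPUTE_UNUSED(info); // this kernel does not use any per-thread information
        // ... iterate over 'window' as before ...
        ARM_COMPUTE_UNUSED(window);
    }
};
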
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index a4fdad8..f263fd0 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
@@ -45,6 +46,38 @@
namespace
{
+void add_wrap_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
+ const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
+
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vaddq_qs8(a, b));
+ },
+ input1, input2, output);
+}
+
+void add_saturate_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
+ const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
+
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqaddq_qs8(a, b));
+ },
+ input1, input2, output);
+}
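
As with the existing U8 and S16 paths, the wrap variant uses the plain NEON add (vaddq_qs8) and the saturate variant the saturating add (vqaddq_qs8). In scalar terms, illustrative only:

#include <algorithm>
#include <cstdint>

// Two's-complement wrap-around addition, the scalar equivalent of vaddq_qs8.
inline int8_t add_wrap_qs8(int8_t a, int8_t b)
{
    return static_cast<int8_t>(static_cast<uint8_t>(a) + static_cast<uint8_t>(b));
}

// Saturating addition clamped to [-128, 127], the scalar equivalent of vqaddq_qs8.
inline int8_t add_saturate_qs8(int8_t a, int8_t b)
{
    const int sum = static_cast<int>(a) + static_cast<int>(b);
    return static_cast<int8_t>(std::min(127, std::max(-128, sum)));
}
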
+
void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
Iterator input1(in1, window);
@@ -112,6 +145,45 @@
return res;
}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+inline float16x8x2_t vadd2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
+{
+ const float16x8x2_t res =
+ {
+ {
+ vaddq_f16(a.val[0], b.val[0]),
+ vaddq_f16(a.val[1], b.val[1])
+ }
+ };
+
+ return res;
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+void add_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
+ const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
+
+ vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vadd2q_f16(a, b));
+ },
+ input1, input2, output);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(in1);
+ ARM_COMPUTE_UNUSED(in2);
+ ARM_COMPUTE_UNUSED(out);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
void add_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
Iterator input1(in1, window);
@@ -294,26 +366,40 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
- if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
- {
- set_format_if_unknown(*output->info(), Format::S16);
- }
- else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output->info(), Format::F32);
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F16 || input2->info()->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
}
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
+ if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
+ {
+ // Check that all data types are the same and all fixed-point positions are the same
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
+ }
static std::map<std::string, AddFunction *> map_function =
{
+ { "add_wrap_QS8_QS8_QS8", &add_wrap_QS8_QS8_QS8 },
+ { "add_saturate_QS8_QS8_QS8", &add_saturate_QS8_QS8_QS8 },
{ "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 },
{ "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 },
{ "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 },
@@ -322,10 +408,15 @@
{ "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 },
{ "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 },
{ "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 },
+ { "add_wrap_QS16_QS16_QS16", &add_wrap_S16_S16_S16 },
+ { "add_saturate_QS16_QS16_QS16", &add_saturate_S16_S16_S16 },
{ "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 },
{ "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 },
{ "add_wrap_F32_F32_F32", &add_F32_F32_F32 },
{ "add_saturate_F32_F32_F32", &add_F32_F32_F32 },
+ { "add_wrap_F16_F16_F16", &add_F16_F16_F16 },
+ { "add_saturate_F16_F16_F16", &add_F16_F16_F16 },
+
};
_input1 = input1;
@@ -368,8 +459,9 @@
INEKernel::configure(win);
}
-void NEArithmeticAdditionKernel::run(const Window &window)
+void NEArithmeticAdditionKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index d3e62b0..85f72c1 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
@@ -44,6 +45,38 @@
namespace
{
+void sub_wrap_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
+ const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
+
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vsubq_qs8(a, b));
+ },
+ input1, input2, output);
+}
+
+void sub_saturate_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
+ const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
+
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqsubq_qs8(a, b));
+ },
+ input1, input2, output);
+}
+
void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
Iterator input1(in1, window);
@@ -124,6 +157,45 @@
input1, input2, output);
}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+inline float16x8x2_t vsub2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
+{
+ const float16x8x2_t res =
+ {
+ {
+ vsubq_f16(a.val[0], b.val[0]),
+ vsubq_f16(a.val[1], b.val[1])
+ }
+ };
+
+ return res;
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+void sub_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
+ const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
+
+ vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vsub2q_f16(a, b));
+ },
+ input1, input2, output);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(in1);
+ ARM_COMPUTE_UNUSED(in2);
+ ARM_COMPUTE_UNUSED(out);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
void sub_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
Iterator input1(in1, window);
@@ -287,26 +359,40 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
- if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
- {
- set_format_if_unknown(*output->info(), Format::S16);
- }
- else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output->info(), Format::F32);
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F16 || input2->info()->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
}
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
+ if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
+ {
+ // Check that all data types are the same and all fixed-point positions are the same
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
+ }
static std::map<std::string, SubFunction *> map_function =
{
+ { "sub_wrap_QS8_QS8_QS8", &sub_wrap_QS8_QS8_QS8 },
+ { "sub_saturate_QS8_QS8_QS8", &sub_saturate_QS8_QS8_QS8 },
{ "sub_wrap_U8_U8_U8", &sub_wrap_U8_U8_U8 },
{ "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 },
{ "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 },
@@ -315,10 +401,15 @@
{ "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 },
{ "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 },
{ "sub_saturate_S16_U8_S16", &sub_saturate_S16_U8_S16 },
+ { "sub_wrap_QS16_QS16_QS16", &sub_wrap_S16_S16_S16 },
+ { "sub_saturate_QS16_QS16_QS16", &sub_saturate_S16_S16_S16 },
{ "sub_wrap_S16_S16_S16", &sub_wrap_S16_S16_S16 },
{ "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 },
{ "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 },
{ "sub_saturate_F32_F32_F32", &sub_F32_F32_F32 },
+ { "sub_wrap_F16_F16_F16", &sub_F16_F16_F16 },
+ { "sub_saturate_F16_F16_F16", &sub_F16_F16_F16 },
+
};
_input1 = input1;
@@ -361,8 +452,9 @@
INEKernel::configure(win);
}
-void NEArithmeticSubtractionKernel::run(const Window &window)
+void NEArithmeticSubtractionKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 9a216ae..f6f6f9c 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -38,7 +38,7 @@
{
}
-void batch_normalization_q8(const ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+void batch_normalization_q8(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
{
Iterator input(in, window);
Iterator output(out, window);
@@ -47,7 +47,7 @@
// Only compute denominator and NEON vectors once per feature map.
int slice = -1;
- int fixed_point_position = in->info()->fixed_point_position();
+ const int fixed_point_position = in->info()->fixed_point_position();
const auto input_mean = reinterpret_cast<const qint8_t *>(mean->ptr_to_element(Coordinates(0, 0)));
const auto input_var = reinterpret_cast<const qint8_t *>(var->ptr_to_element(Coordinates(0, 0)));
const auto input_gamma = reinterpret_cast<const qint8_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
@@ -58,7 +58,7 @@
qint8x16_t gamma_vec = vdupq_n_qs8(0);
qint8x16_t beta_vec = vdupq_n_qs8(0);
qint8x16_t denominator = vdupq_n_qs8(0);
- const qint8x16_t epsilon_vec = vdupq_n_qs8(scvt_qs8_f32(epsilon, fixed_point_position));
+ const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(epsilon, fixed_point_position));
execute_window_loop(window, [&](const Coordinates & id)
{
if(slice != id.z())
@@ -82,7 +82,51 @@
input, output);
}
-void batch_normalization_fp32(const ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+void batch_normalization_q16(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+{
+ Iterator input(in, window);
+ Iterator output(out, window);
+
+ // Hold information about the current feature map we are iterating.
+ // Only compute denominator and NEON vectors once per feature map.
+ int slice = -1;
+
+ const int fixed_point_position = in->info()->fixed_point_position();
+ const auto input_mean = reinterpret_cast<const qint16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const qint16_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = reinterpret_cast<const qint16_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
+ const auto input_beta = reinterpret_cast<const qint16_t *>(beta->ptr_to_element(Coordinates(0, 0)));
+
+ qint16x8_t mean_vec = vdupq_n_qs16(0);
+ qint16x8_t var_vec = vdupq_n_qs16(0);
+ qint16x8_t gamma_vec = vdupq_n_qs16(0);
+ qint16x8_t beta_vec = vdupq_n_qs16(0);
+ qint16x8_t denominator = vdupq_n_qs16(0);
+ const qint16x8_t epsilon_vec = vdupq_n_qs16(sqcvt_qs16_f32(epsilon, fixed_point_position));
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ if(slice != id.z())
+ {
+            // Construct vectors
+ mean_vec = vdupq_n_qs16(*(input_mean + id.z()));
+ var_vec = vdupq_n_qs16(*(input_var + id.z()));
+ gamma_vec = vdupq_n_qs16(*(input_gamma + id.z()));
+ beta_vec = vdupq_n_qs16(*(input_beta + id.z()));
+
+ // Calculate denominator
+ denominator = vqinvsqrtq_qs16(vqaddq_qs16(var_vec, epsilon_vec), fixed_point_position);
+ slice = id.z();
+ }
+
+ // Calculate x bar and store results
+ const qint16x8_t numerator = vqsubq_qs16(vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr())), mean_vec);
+ const qint16x8_t x_bar = vqmulq_qs16(numerator, denominator, fixed_point_position);
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmlaq_qs16(beta_vec, x_bar, gamma_vec, fixed_point_position));
+ },
+ input, output);
+}
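
Per element, every variant of this kernel (QS8, QS16, F16, F32) computes the same transform, out = gamma * (x - mean) / sqrt(var + epsilon) + beta, with mean, var, gamma and beta taken per feature map (z slice). A scalar reference, illustrative only:

#include <cmath>

// Scalar reference of the batch normalization transform vectorised above (illustrative only).
inline float batch_norm_ref(float x, float mean, float var, float gamma, float beta, float epsilon)
{
    const float denominator = 1.f / std::sqrt(var + epsilon);
    const float x_bar       = (x - mean) * denominator;
    return gamma * x_bar + beta;
}
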
+
+void batch_normalization_fp32(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
{
Iterator input(in, window);
Iterator output(out, window);
@@ -125,29 +169,78 @@
input, output);
}
-void NEBatchNormalizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+#ifdef ARM_COMPUTE_ENABLE_FP16
+void batch_normalization_fp16(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ Iterator input(in, window);
+ Iterator output(out, window);
+
+ // Hold information about the current feature map we are iterating.
+ // Only compute denominator and NEON vectors once per feature map.
+ int slice = -1;
+
+ const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
+ const auto input_beta = reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0)));
+
+ float16x8_t mean_vec = vdupq_n_f16(0.0);
+ float16x8_t var_vec = vdupq_n_f16(0.0);
+ float16x8_t gamma_vec = vdupq_n_f16(0.0);
+ float16x8_t beta_vec = vdupq_n_f16(0.0);
+ float16x8_t denominator = vdupq_n_f16(0.0);
+ const float16x8_t epsilon_vec = vdupq_n_f16(epsilon);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ if(slice != id.z())
+ {
+            // Construct vectors
+ mean_vec = vdupq_n_f16(*(input_mean + id.z()));
+ var_vec = vdupq_n_f16(*(input_var + id.z()));
+ gamma_vec = vdupq_n_f16(*(input_gamma + id.z()));
+ beta_vec = vdupq_n_f16(*(input_beta + id.z()));
+
+ // Calculate denominator
+ denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec));
+ slice = id.z();
+ }
+
+ // Calculate x bar and store results
+ const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
+ const float16x8_t x_bar = vmulq_f16(numerator, denominator);
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec)));
+ },
+ input, output);
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
_input = input;
- _output = output;
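+    // Default to in-place computation; overridden below when an output tensor is provided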
+ _output = input;
_mean = mean;
_var = var;
_gamma = gamma;
_beta = beta;
_epsilon = epsilon;
+ if(output != nullptr)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _output = output;
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, mean, var, beta, gamma);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, mean, var, beta, gamma);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+
unsigned int num_elems_processed_per_iteration = 0;
switch(input->info()->data_type())
@@ -156,29 +249,43 @@
_func = &batch_normalization_q8;
num_elems_processed_per_iteration = 16;
break;
+ case DataType::QS16:
+ _func = &batch_normalization_q16;
+ num_elems_processed_per_iteration = 8;
+ break;
case DataType::F32:
_func = &batch_normalization_fp32;
num_elems_processed_per_iteration = 4;
break;
+ case DataType::F16:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ _func = &batch_normalization_fp16;
+ num_elems_processed_per_iteration = 8;
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
ARM_COMPUTE_ERROR("Element size not supported");
break;
}
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region());
-
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+ }
+ else
+ {
+ update_window_and_padding(win, input_access);
+ }
INEKernel::configure(win);
}
-void NEBatchNormalizationLayerKernel::run(const Window &window)
+void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index e8e448e..3888300 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -93,8 +93,9 @@
INEKernel::configure(win);
}
-void NEBitwiseAndKernel::run(const Window &window)
+void NEBitwiseAndKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Iterator input1(_input1, window);
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
index bf75592..08d7fe2 100644
--- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -81,8 +81,9 @@
INEKernel::configure(win);
}
-void NEBitwiseNotKernel::run(const Window &window)
+void NEBitwiseNotKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Iterator input(_input, window);
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
index f184be2..1b17cc2 100644
--- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -93,8 +93,9 @@
INEKernel::configure(win);
}
-void NEBitwiseOrKernel::run(const Window &window)
+void NEBitwiseOrKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Iterator input1(_input1, window);
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
index c4fb4c0..9451e8a 100644
--- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -89,8 +89,9 @@
INEKernel::configure(win);
}
-void NEBitwiseXorKernel::run(const Window &window)
+void NEBitwiseXorKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Iterator input1(_input1, window);
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
index d7e6d73..d7178e4 100644
--- a/src/core/NEON/kernels/NEBox3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
@@ -34,8 +34,9 @@
using namespace arm_compute;
#ifdef ARM_COMPUTE_ENABLE_FP16
-void NEBox3x3FP16Kernel::run(const Window &window)
+void NEBox3x3FP16Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
@@ -103,7 +104,7 @@
},
input, output);
}
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
BorderSize NEBox3x3Kernel::border_size() const
{
@@ -144,8 +145,9 @@
INEKernel::configure(win);
}
-void NEBox3x3Kernel::run(const Window &window)
+void NEBox3x3Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index 85a2cd5..bcbe790 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -787,7 +787,7 @@
INEKernel::configure(win);
}
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
namespace
{
@@ -1677,8 +1677,9 @@
INEKernel::configure(win);
}
-void NEGradientKernel::run(const Window &window)
+void NEGradientKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -1758,8 +1759,9 @@
INEKernel::configure(win);
}
-void NEEdgeNonMaxSuppressionKernel::run(const Window &window)
+void NEEdgeNonMaxSuppressionKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -1838,8 +1840,9 @@
INEKernel::configure(win);
}
-void NEEdgeTraceKernel::run(const Window &window)
+void NEEdgeTraceKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Iterator input(_input, window);
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
index 3147a69..a2b24de 100644
--- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -284,8 +284,9 @@
return _is_parallelizable;
}
-void NEChannelCombineKernel::run(const Window &window)
+void NEChannelCombineKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
index ebc4b85..bac2471 100644
--- a/src/core/NEON/kernels/NEChannelExtractKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
@@ -148,12 +148,11 @@
_input = input;
_output = output;
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowRectangle output_access(input->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / subsampling, 1.f / subsampling);
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / subsampling, 1.f / subsampling);
- update_window_and_padding(win,
- AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
- output_access);
+ update_window_and_padding(win, input_access, output_access);
ValidRegion input_valid_region = input->info()->valid_region();
@@ -257,16 +256,17 @@
_output = output;
Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
AccessWindowHorizontal input_access(_input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, _input->info()->valid_region());
INEKernel::configure(win);
}
-void NEChannelExtractKernel::run(const Window &window)
+void NEChannelExtractKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index 6d370ac..460d37e 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -69,20 +69,21 @@
void NECol2ImKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- set_data_type_if_unknown(*output->info(), input->info()->data_type());
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
TensorShape output_shape = input->info()->tensor_shape();
output_shape.set(0, convolved_dims.first);
output_shape.set(1, convolved_dims.second);
output_shape.set(2, input->info()->tensor_shape()[0]);
- set_shape_if_empty(*output->info(), output_shape);
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
@@ -115,8 +116,9 @@
INEKernel::configure(win);
}
-void NECol2ImKernel::run(const Window &window)
+void NECol2ImKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp
index cb5152e..347aeae 100644
--- a/src/core/NEON/kernels/NEColorConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp
@@ -572,8 +572,9 @@
INEKernel::configure(win);
}
-void NEColorConvertKernel::run(const Window &window)
+void NEColorConvertKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
index 30e91ef..263fbe0 100644
--- a/src/core/NEON/kernels/NEConvolutionKernel.cpp
+++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp
@@ -621,8 +621,9 @@
}
template <unsigned int matrix_size>
-void NEConvolutionKernel<matrix_size>::run(const Window &window)
+void NEConvolutionKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
@@ -694,8 +695,9 @@
}
template <unsigned int matrix_size>
-void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window)
+void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
switch(_output->info()->data_type())
@@ -1131,8 +1133,9 @@
}
template <unsigned int matrix_size>
-void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window)
+void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
@@ -1464,8 +1467,9 @@
INEKernel::configure(win);
}
-void NEConvolutionRectangleKernel::run(const Window &window)
+void NEConvolutionRectangleKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
index 32789cb..b65f3ba 100644
--- a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
+++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
@@ -67,8 +67,9 @@
INEKernel::configure(calculate_max_window(*input->info()));
}
-void NECumulativeDistributionKernel::run(const Window &window)
+void NECumulativeDistributionKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_distribution->buffer() == nullptr);
diff --git a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
index 902490e..7a62b0c 100644
--- a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
@@ -27,17 +27,76 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include <arm_neon.h>
+#include <cstdint>
using namespace arm_compute;
+namespace
+{
+// Overloads of 128-bit vector loads
+uint8x16_t loadq(const uint8_t *ptr)
+{
+ return vld1q_u8(ptr);
+}
+uint16x8_t loadq(const uint16_t *ptr)
+{
+ return vld1q_u16(ptr);
+}
+uint32x4_t loadq(const uint32_t *ptr)
+{
+ return vld1q_u32(ptr);
+}
+// Overloads of 128-bit vector stores
+void storeq(uint8_t *ptr, uint8x16_t val)
+{
+ return vst1q_u8(ptr, val);
+}
+void storeq(uint16_t *ptr, uint16x8_t val)
+{
+ return vst1q_u16(ptr, val);
+}
+void storeq(uint32_t *ptr, uint32x4_t val)
+{
+ return vst1q_u32(ptr, val);
+}
+
+template <typename T>
+void depth_concat(const ITensor *in, ITensor *out, std::pair<int, int> start_xy, int depth_offset, const Window &window)
+{
+ const int start_x = start_xy.first;
+ const int start_y = start_xy.second;
+
+ // Offset input
+ const int input_offset_to_first_elements_in_bytes = in->info()->offset_first_element_in_bytes() - start_x * in->info()->strides_in_bytes()[0] - start_y * in->info()->strides_in_bytes()[1];
+ uint8_t *input_ptr = in->buffer() + input_offset_to_first_elements_in_bytes;
+
+ // Offset output
+ const unsigned int output_offset_to_first_elements_in_bytes = out->info()->offset_first_element_in_bytes() + depth_offset * out->info()->strides_in_bytes()[2];
+ uint8_t *output_ptr = out->buffer() + output_offset_to_first_elements_in_bytes;
+
+ Iterator input(in, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(input_ptr + input.offset());
+ const auto out_ptr = reinterpret_cast<T *>(output_ptr + output.offset());
+
+ storeq(out_ptr, loadq(in_ptr));
+ },
+ input, output);
+}
+} // namespace
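
Since depth concatenation only copies data, the kernel dispatches on element size rather than on data type: QS8 uses the 8-bit copy, QS16 and F16 the 16-bit copy, and F32 the 32-bit copy, each moving one 128-bit register per iteration. An untyped scalar equivalent of a single iteration, illustrative only:

#include <cstdint>
#include <cstring>

// One iteration of depth_concat is a bitwise copy of 16 bytes (one NEON register),
// independent of the element type; shown here with memcpy for illustration only.
inline void copy_block_16(const std::uint8_t *in_ptr, std::uint8_t *out_ptr)
{
    std::memcpy(out_ptr, in_ptr, 16);
}
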
+
NEDepthConcatenateKernel::NEDepthConcatenateKernel()
- : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
+ : _func(nullptr), _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
{
}
@@ -48,8 +107,9 @@
void NEDepthConcatenateKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
@@ -60,18 +120,36 @@
ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+ _func = nullptr;
_input = input;
_output = output;
_depth_offset = depth_offset;
_left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
_top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
- const unsigned int num_elems_processed_per_iteration = 4;
- const unsigned int num_elems_read_per_iteration = 4;
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ _func = &depth_concat<uint8_t>;
+ break;
+ case DataType::QS16:
+ case DataType::F16:
+ _func = &depth_concat<uint16_t>;
+ break;
+ case DataType::F32:
+ _func = &depth_concat<uint32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const unsigned int num_elems_read_per_iteration = 16 / input->info()->element_size();
const unsigned int num_rows_read_per_iteration = 1;
// The window needs to be based on input as we copy all the depths of input
- Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size());
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
@@ -81,25 +159,12 @@
INEKernel::configure(win);
}
-void NEDepthConcatenateKernel::run(const Window &window)
+void NEDepthConcatenateKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
- // Offset output
- const unsigned int offset_to_first_elements_in_bytes = _output->info()->offset_first_element_in_bytes() + _left_right * _output->info()->strides_in_bytes()[0] + _top_bottom *
- _output->info()->strides_in_bytes()[1] + _depth_offset * _output->info()->strides_in_bytes()[2];
- uint8_t *output_ptr = _output->buffer() + offset_to_first_elements_in_bytes;
-
- Iterator input(_input, window);
- Iterator output(_output, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto out_ptr = reinterpret_cast<float *>(output_ptr + output.offset());
-
- vst1q_f32(out_ptr, vld1q_f32(in_ptr));
- },
- input, output);
+ (*_func)(_input, _output, std::make_pair(_left_right, _top_bottom), _depth_offset, window);
}
diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
index 56612a7..d97a20b 100644
--- a/src/core/NEON/kernels/NEDepthConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
@@ -40,45 +40,91 @@
} // namespace arm_compute
NEDepthConvertKernel::NEDepthConvertKernel()
- : _policy(), _shift(0)
+ : _input(nullptr), _output(nullptr), _policy(), _shift(0), _fixed_point_position_input(0), _fixed_point_position_output(0)
{
}
-void NEDepthConvertKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEDepthConvertKernel::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32);
- ARM_COMPUTE_ERROR_ON(shift >= 8);
- ARM_COMPUTE_ERROR_ON(input == output);
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data_types must be different");
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::F32),
- "Only data_types supported [in] QS8 -> [out] F32");
+ _input = input;
+ _output = input;
+ _policy = policy;
+ _shift = shift;
+
+ if(output != nullptr)
+ {
+        // Auto initialize the output shape if not yet initialized (we can only auto-configure the shape; the data type must be given)
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ // Set output
+ _output = output;
+ }
+
+ // Set initial fixed point position of input and output
+ _fixed_point_position_input = input->info()->fixed_point_position();
+ _fixed_point_position_output = _output->info()->fixed_point_position();
+
+ // Set the fixed point position to the output tensor if needed
+ if(is_data_type_fixed_point(input->info()->data_type()) && is_data_type_fixed_point(_output->info()->data_type()))
+ {
+ // If in-place set the fixed point position of the output tensor to be equal to shift
+ _fixed_point_position_output = (_input == _output) ? static_cast<int>(_shift) : _fixed_point_position_output;
+ // Set fixed point position to output tensor
+ _output->info()->set_fixed_point_position(_fixed_point_position_output);
+ }
+
+ ARM_COMPUTE_ERROR_ON(shift >= 8 && (!is_data_type_fixed_point(input->info()->data_type()) && !is_data_type_fixed_point(output->info()->data_type())));
+ ARM_COMPUTE_ERROR_ON(input == output && (data_size_from_type(input->info()->data_type()) != data_size_from_type(output->info()->data_type())));
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
&& output->info()->data_type() != DataType::S32),
"Only data_types supported [in] U8 -> [out] U16, S16, S32");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::F32),
+ "Only data_types supported [in] QS8 -> [out] QS8, F32");
+
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
"Only data_types supported [in] U16 -> [out] U8, U32");
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
"Only data_types supported [in] S16 -> [out] U8, S32");
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8),
- "Only data_types supported [in] F32 -> [out] QS8");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::QS16 && output->info()->data_type() != DataType::F32),
+ "Only data_types supported [in] QS16 -> [out] QS16, F32");
- _policy = policy;
- _shift = shift;
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::QS16),
+ "Only data_types supported [in] F32 -> [out] QS8, QS16");
constexpr unsigned int num_elems_processed_per_iteration = 16;
- INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+ }
+ else
+ {
+ // In-place computation
+ update_window_and_padding(win, input_access);
+ }
+ ICPPKernel::configure(win);
}
-void NEDepthConvertKernel::run(const Window &window)
+void NEDepthConvertKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(nullptr == _input);
ARM_COMPUTE_ERROR_ON(nullptr == _output);
ARM_COMPUTE_ERROR_ON(_input == _output);
@@ -86,37 +132,10 @@
Iterator input(_input, window);
Iterator output(_output, window);
+ bool in_place = (_input == _output);
+
switch(_input->info()->data_type())
{
- case DataType::QS8:
- {
- const int fixed_point_position = _input->info()->fixed_point_position();
-
- switch(_output->info()->data_type())
- {
- case DataType::F32:
- {
- /* Up-conversion QS8 -> F32 */
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast<const int8_t *>(input.ptr()));
-
- float32x4x2_t texels_low = vcvt_f32_qs8(vget_low_s8(texels_s8), fixed_point_position);
- float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_s8), fixed_point_position);
-
- vst1q_f32(reinterpret_cast<float *>(output.ptr()), texels_low.val[0]);
- vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, texels_low.val[1]);
- vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, texels_high.val[0]);
- vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, texels_high.val[1]);
- },
- input, output);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Output data type not supported");
- }
- break;
- }
case DataType::U8:
{
const int16x8_t b = vdupq_n_s16(_shift);
@@ -193,6 +212,49 @@
}
break;
}
+ case DataType::QS8:
+ {
+ switch(_output->info()->data_type())
+ {
+ case DataType::QS8:
+ {
+ const int relative_shift = _fixed_point_position_output - _fixed_point_position_input;
+ /* Fixed point position conversion QS8 -> QS8 */
+ if(relative_shift != 0 || !in_place)
+ {
+ const auto relative_shift_vec = vdupq_n_qs8(relative_shift);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr()));
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqrshlq_s8(texels_qs8, relative_shift_vec));
+ },
+ input, output);
+ }
+ break;
+ }
+ case DataType::F32:
+ {
+ /* Up-conversion QS8 -> F32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr()));
+
+ float32x4x2_t texels_low = vcvt_f32_qs8(vget_low_s8(texels_qs8), _fixed_point_position_input);
+ float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_qs8), _fixed_point_position_input);
+
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), texels_low.val[0]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, texels_low.val[1]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, texels_high.val[0]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, texels_high.val[1]);
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
case DataType::S16:
{
switch(_output->info()->data_type())
@@ -346,13 +408,65 @@
}
break;
}
+ case DataType::QS16:
+ {
+ switch(_output->info()->data_type())
+ {
+ case DataType::QS16:
+ {
+ const int relative_shift = _fixed_point_position_output - _fixed_point_position_input;
+ /* Fixed point position conversion QS16 -> QS16 */
+ if(relative_shift != 0 || !in_place)
+ {
+ const auto relative_shift_vec = vdupq_n_qs16(relative_shift);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint16x8x2_t texels_qs16 =
+ {
+ {
+ vld1q_qs16(reinterpret_cast<qint16_t *>(input.ptr())),
+ vld1q_qs16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
+ }
+ };
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqrshlq_s16(texels_qs16.val[0], relative_shift_vec));
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqrshlq_s16(texels_qs16.val[1], relative_shift_vec));
+ },
+ input, output);
+ }
+ break;
+ }
+ case DataType::F32:
+ {
+ /* Up-conversion QS16 -> F32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t texels_qs16 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr())),
+ vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
+ }
+ };
+
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvt_f32_qs16(vget_low_s16(texels_qs16.val[0]), _fixed_point_position_input));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[0]), _fixed_point_position_input));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvt_f32_qs16(vget_low_s16(texels_qs16.val[1]), _fixed_point_position_input));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[1]), _fixed_point_position_input));
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
case DataType::F32:
{
switch(_output->info()->data_type())
{
case DataType::QS8:
{
- const int fixed_point_position = _output->info()->fixed_point_position();
/* Down-conversion F32 -> QS8 */
execute_window_loop(window, [&](const Coordinates & id)
{
@@ -366,13 +480,39 @@
}
};
- const qint8x16_t texels_s8 = vcvtq_qs8_f32(texels_f32, fixed_point_position);
+ const qint8x16_t texels_s8 = vqcvtq_qs8_f32(texels_f32, _fixed_point_position_output);
vst1q_s8(reinterpret_cast<int8_t *>(output.ptr()), texels_s8);
},
input, output);
break;
}
+ case DataType::QS16:
+ {
+ /* Down-conversion F32 -> QS16 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4x2_t texels_f32_1 =
+ {
+ {
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
+ }
+ };
+ const float32x4x2_t texels_f32_2 =
+ {
+ {
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()), vqcvtq_qs16_f32(texels_f32_1, _fixed_point_position_output));
+ vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqcvtq_qs16_f32(texels_f32_2, _fixed_point_position_output));
+ },
+ input, output);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Output data type not supported");
}
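
Illustrative aside (not part of the patch): the QS8 -> QS8 branch above changes the fixed point position with a saturating rounding shift (vqrshlq_s8 driven by relative_shift). A minimal scalar sketch of that per-element operation, using a hypothetical helper name requantize_qs8 and assuming arithmetic right shifts, is:

#include <algorithm>
#include <cstdint>

// Scalar sketch: shift > 0 adds fractional bits, shift < 0 removes them with
// rounding; the result is saturated to the signed 8-bit range, mirroring the
// saturating rounding behaviour of the NEON path in spirit.
int8_t requantize_qs8(int8_t in, int fixed_point_position_input, int fixed_point_position_output)
{
    const int shift = fixed_point_position_output - fixed_point_position_input;
    int32_t   v     = in;
    if(shift >= 0)
    {
        v <<= shift;
    }
    else
    {
        v = (v + (1 << (-shift - 1))) >> -shift; // rounding right shift
    }
    return static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, v)));
}
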
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
new file mode 100644
index 0000000..70984f0
--- /dev/null
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+NEDequantizationLayerKernel::NEDequantizationLayerKernel()
+ : _input(nullptr), _output(nullptr), _min_max(nullptr)
+{
+}
+
+void NEDequantizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(min_max);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::F32, 0);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _output = output;
+ _min_max = min_max;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+
+ // Update window and padding
+ update_window_and_padding(win, input_access, output_access, min_max_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEDequantizationLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window window_input_output(window);
+ window_input_output.collapse_if_possible(INEKernel::window(), 3);
+ window_input_output.set(3, Window::Dimension(0, 1, 1));
+
+ Window window_min_max;
+ window_min_max.use_tensor_dimensions(_min_max->info()->tensor_shape());
+ window_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_min_max.collapse_if_possible(INEKernel::window(), 1);
+
+ Iterator input(_input, window_input_output);
+ Iterator output(_output, window_input_output);
+ Iterator min_max(_min_max, window_min_max);
+
+ execute_window_loop(window_min_max, [&](const Coordinates & id_batch)
+ {
+ // Get the min and max
+ const float min = *(reinterpret_cast<const float *>(min_max.ptr()) + 0);
+ const float max = *(reinterpret_cast<const float *>(min_max.ptr()) + 1);
+
+ const float32x4_t vmin = vdupq_n_f32(min);
+ const float range = max - min;
+ const float32x4_t scaling = vdupq_n_f32(range / 255.0f);
+
+ // Uniformly map the quantized 8-bit values back to the original range, i.e. [0, 255] -> [min, max]
+ execute_window_loop(window_input_output, [&](const Coordinates & id)
+ {
+ // Get the input values
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);
+
+ const uint8x8_t val_u8 = vld1_u8(input_ptr);
+ const uint16x8_t val_u16 = vmovl_u8(val_u8);
+ const uint32x4_t val_u32_low = vmovl_u16(vget_low_u16(val_u16));
+ const uint32x4_t val_u32_high = vmovl_u16(vget_high_u16(val_u16));
+ float32x4_t val_low = vcvtq_f32_u32(val_u32_low);
+ float32x4_t val_high = vcvtq_f32_u32(val_u32_high);
+
+ // Dequantize -> (q / 255.0 * range) + min
+ val_low = vmulq_f32(val_low, scaling);
+ val_high = vmulq_f32(val_high, scaling);
+ val_low = vaddq_f32(val_low, vmin);
+ val_high = vaddq_f32(val_high, vmin);
+
+ const float32x4x2_t dequantized = vuzpq_f32(val_low, val_high);
+
+ // Store the dequantized values
+ auto output_ptr = reinterpret_cast<float *>(output.ptr() + id_batch[1] * _output->info()->strides_in_bytes()[3]);
+ vst2q_f32(output_ptr, dequantized);
+ },
+ input, output);
+ },
+ min_max);
+}
\ No newline at end of file
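
Illustrative aside (not part of the patch): the vectorised loop above reduces, per element, to the affine mapping noted in the "Dequantize" comment. A minimal scalar sketch, using a hypothetical helper name dequantize_u8, is:

#include <cstdint>

// Scalar sketch: a quantized value q in [0, 255] is mapped linearly back to [min, max].
float dequantize_u8(uint8_t q, float min, float max)
{
    const float range = max - min;
    return static_cast<float>(q) * (range / 255.0f) + min;
}
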
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp
index bf7e097..a5680eb 100644
--- a/src/core/NEON/kernels/NEDerivativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDerivativeKernel.cpp
@@ -214,8 +214,9 @@
in, out_x, out_y);
}
-void NEDerivativeKernel::run(const Window &window)
+void NEDerivativeKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEDilateKernel.cpp b/src/core/NEON/kernels/NEDilateKernel.cpp
index 867cf77..3ee00a4 100644
--- a/src/core/NEON/kernels/NEDilateKernel.cpp
+++ b/src/core/NEON/kernels/NEDilateKernel.cpp
@@ -67,8 +67,9 @@
INEKernel::configure(win);
}
-void NEDilateKernel::run(const Window &window)
+void NEDilateKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
index effc50e..6631359 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
@@ -54,6 +54,11 @@
return vld1q_qs16(in);
}
+inline qint32x4_t internal_vld1q(const qint32_t *in)
+{
+ return vld1q_s32(in);
+}
+
// Internal store
inline void internal_vst1q(float *p, const float32x4_t &v)
{
@@ -72,6 +77,16 @@
vst1q_qs16(p, v);
}
+inline void internal_vst1q(qint32_t *p, const qint32x4_t &v)
+{
+ vst1q_s32(p, v);
+}
+
+inline void internal_vst1q(qint16_t *p, const qint32x4_t &v)
+{
+ vst1_qs16(p, vqmovn_qs32(v));
+}
+
// Internal vdup
inline float32x4_t internal_vdupq_n(float v)
{
@@ -86,6 +101,11 @@
return vdupq_n_qs16(v);
}
+inline qint32x4_t internal_vdupq_n(qint32_t v)
+{
+ return vdupq_n_qs32(v);
+}
+
// Internal vadd
inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y)
{
@@ -99,6 +119,29 @@
{
return vqaddq_qs16(x, y);
}
+inline qint32x4_t internal_vqaddq(const qint32x4_t &x, const qint32x4_t &y)
+{
+ return vqaddq_qs32(x, y);
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+inline float16x8_t internal_vld1q(const float16_t *in)
+{
+ return vld1q_f16(in);
+}
+inline void internal_vst1q(float16_t *p, const float16x8_t &v)
+{
+ vst1q_f16(p, v);
+}
+inline float16x8_t internal_vdupq_n(float16_t v)
+{
+ return vdupq_n_f16(v);
+}
+inline float16x8_t internal_vqaddq(const float16x8_t &x, const float16x8_t &y)
+{
+ return vaddq_f16(x, y);
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
template <typename T1, typename T2, bool in_place>
void accumulate_bias(ITensor *input, const ITensor *bias, const Window window, ITensor *output)
@@ -143,8 +186,8 @@
void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != bias->info()->fixed_point_position());
if(output != nullptr)
{
@@ -179,26 +222,53 @@
INEKernel::configure(win);
// Set appropriate function
- if(input->info()->data_type() == DataType::F32)
+ switch(input->info()->data_type())
{
- _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
- }
- else if(input->info()->data_type() == DataType::QS8)
- {
- _func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
- }
- else if(input->info()->data_type() == DataType::QS16 && bias->info()->data_type() == DataType::QS8)
- {
- _func = (output == nullptr) ? &accumulate_bias<qint16_t, qint8_t, true> : &accumulate_bias<qint16_t, qint8_t, false>;
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ case DataType::QS8:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
+ break;
+ }
+ case DataType::QS16:
+ {
+ if(bias->info()->data_type() == DataType::QS8)
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint16_t, qint8_t, true> : &accumulate_bias<qint16_t, qint8_t, false>;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ break;
+ }
+ case DataType::QS32:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint32_t, qint16_t, true> : &accumulate_bias<qint32_t, qint16_t, false>;
+ break;
+ }
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<float16_t, float16_t, true> : &accumulate_bias<float16_t, float16_t, false>;
+ break;
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ case DataType::F32:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ break;
+ }
}
}
-void NEDirectConvolutionLayerBiasAccumulateKernel::run(const Window &window)
+void NEDirectConvolutionLayerBiasAccumulateKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index d608898..c8e1113 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include <algorithm>
@@ -40,6 +41,81 @@
namespace
{
template <unsigned int stridex>
+qint16x8_t internal_vld1q(const qint16_t *in);
+
+template <>
+qint16x8_t internal_vld1q<1>(const qint16_t *in)
+{
+ return vld1q_qs16(in);
+}
+
+template <>
+qint16x8_t internal_vld1q<2>(const qint16_t *in)
+{
+ const int16x8x2_t tmp = vld2q_s16(in);
+ return tmp.val[0];
+}
+
+template <>
+qint16x8_t internal_vld1q<3>(const qint16_t *in)
+{
+ const int16x8x3_t tmp = vld3q_s16(in);
+ return tmp.val[0];
+}
+
+inline qint16x8_t internal_vdupq_n(qint16_t v)
+{
+ return vdupq_n_qs16(v);
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+template <unsigned int stridex>
+float16x8_t internal_vld1q(const float16_t *in);
+
+template <>
+float16x8_t internal_vld1q<1>(const float16_t *in)
+{
+ return vld1q_f16(in);
+}
+
+template <>
+float16x8_t internal_vld1q<2>(const float16_t *in)
+{
+ const float16x8x2_t tmp = vld2q_f16(in);
+ return tmp.val[0];
+}
+
+template <>
+float16x8_t internal_vld1q<3>(const float16_t *in)
+{
+ const float16x8x3_t tmp = vld3q_f16(in);
+ return tmp.val[0];
+}
+
+inline float16x8_t internal_vdupq_n(float16_t v)
+{
+ return vdupq_n_f16(v);
+}
+
+inline void internal_vst1q(float16_t *p, const float16x8_t &v)
+{
+ vst1q_f16(p, v);
+}
+
+float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmulq_f16(x, y);
+}
+
+inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vaddq_f16(x, vmulq_f16(y, z));
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+template <unsigned int stridex>
float32x4_t internal_vld1q(const float *in);
template <>
@@ -62,6 +138,28 @@
return tmp.val[0];
}
+inline float32x4_t internal_vdupq_n(float v)
+{
+ return vdupq_n_f32(v);
+}
+
+inline void internal_vst1q(float *p, const float32x4_t &v)
+{
+ vst1q_f32(p, v);
+}
+
+float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmulq_f32(x, y);
+}
+
+inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmlaq_f32(x, y, z);
+}
+
template <unsigned int stridex>
qint8x8_t internal_vld1q(const qint8_t *in);
@@ -85,28 +183,19 @@
return tmp.val[0];
}
-template <unsigned int stridex>
-qint16x8_t internal_vld1q(const qint16_t *in);
-
-template <>
-qint16x8_t internal_vld1q<1>(const qint16_t *in)
-{
- return vld1q_s16(in);
-}
-
-inline float32x4_t internal_vdupq_n(float v)
-{
- return vdupq_n_f32(v);
-}
-
inline qint8x8_t internal_vdupq_n(qint8_t v)
{
return vdup_n_qs8(v);
}
-inline void internal_vst1q(float *p, const float32x4_t &v)
+inline qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
{
- vst1q_f32(p, v);
+ return vmull_qs8(x, y, fixed_point_position);
+}
+
+inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
+{
+ return vqmlal_qs8(x, y, z, fixed_point_position);
}
inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
@@ -114,28 +203,140 @@
vst1q_qs16(p, v);
}
-float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
+inline void internal_vst1q(int *p, const qint32x4x2_t &v)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
- return vmulq_f32(x, y);
+ vst1q_s32(p, v.val[0]);
+ vst1q_s32(p + 4, v.val[1]);
}
-qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
+template <unsigned int stridex>
+qint32x4x2_t internal_vld1q(const qint32_t *in);
+
+template <>
+qint32x4x2_t internal_vld1q<1>(const qint32_t *in)
{
- return vmull_qs8(x, y, fixed_point_position);
+ const qint32x4x2_t r =
+ {
+ {
+ vld1q_s32(in),
+ vld1q_s32(in + 4)
+ }
+ };
+ return r;
}
-inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
+inline qint32x4x2_t internal_vmull(const qint16x8_t &x, const qint16x8_t &y, int fixed_point_position)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
- return vmlaq_f32(x, y, z);
+ const qint32x4x2_t r =
+ {
+ {
+ vmull_qs16(vget_low_s16(x), vget_low_s16(y), fixed_point_position),
+ vmull_qs16(vget_high_s16(x), vget_high_s16(y), fixed_point_position),
+ }
+ };
+ return r;
}
-inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
+inline qint32x4x2_t internal_vmlal(const qint32x4x2_t &x, const qint16x8_t &y, const qint16x8_t &z, int fixed_point_position)
{
- return vqmlal_qs8(x, y, z, fixed_point_position);
+ const qint32x4x2_t r =
+ {
+ {
+ vqmlal_qs16(x.val[0], vget_low_s16(y), vget_low_s16(z), fixed_point_position),
+ vqmlal_qs16(x.val[1], vget_high_s16(y), vget_high_s16(z), fixed_point_position)
+ }
+ };
+ return r;
}
+constexpr int SmallTensorSizeOptim = 8;
+inline bool run_optim_small_tensor(const ITensor *t)
+{
+ return t->info()->dimension(Window::DimX) <= SmallTensorSizeOptim && t->info()->dimension(Window::DimY) <= SmallTensorSizeOptim;
+}
+
+// Optimized convolver for 1x1 kernels used only where input width and height are both <= 8
+// For big Z as in Input=7x7x832, this implementation is faster than the general code because it doesn't need to
+// store intermediate results in memory. Temporary results are stored in NEON registers directly and then written to the output buffer.
+template <unsigned int stridex>
+class convolver_w1x1_i8x8_f32
+{
+public:
+ static void convolve(const Window &window, const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) > SmallTensorSizeOptim);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) > SmallTensorSizeOptim);
+
+ const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+ const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z();
+ const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
+ const int output_h = output->info()->dimension(1);
+ const int range_z = window.z().end() - window.z().start();
+ const int kernel_depth = weights->info()->dimension(Window::DimZ);
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+
+ // setup output window for the iterator
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
+ window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+ window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
+
+ // setup input window for the iterator
+ Window window_in = window;
+ // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
+ window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window window_k = calculate_max_window(*weights->info(), Steps(1u));
+ Iterator out(output, window_out);
+ Iterator in(input, window_in);
+ Iterator k(weights, window_k);
+
+ const uint8_t *k_ptr = k.ptr();
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const uint8_t *input_ptr = in.ptr();
+ uint8_t *out_ptr = out.ptr();
+ int ih = 0;
+ int oh = 0;
+ float32x4_t accum0[SmallTensorSizeOptim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
+ float32x4_t accum1[SmallTensorSizeOptim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
+ for(int oz = 0; oz < range_z; ++oz)
+ {
+ accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f);
+ accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f);
+ auto p_out_base = out_ptr + oz * output_stride_z;
+ for(int p = 0; p < kernel_depth; ++p)
+ {
+ const auto k_val = reinterpret_cast<const float *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
+ const auto vk0 = internal_vdupq_n(*k_val);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ const int offset_xy = ih * input_stride_y;
+ auto in_val = reinterpret_cast<const float *>(input_ptr + p * input_stride_z + offset_xy);
+ auto v_in0 = internal_vld1q<stridex>(in_val);
+ auto v_in1 = internal_vld1q<stridex>(in_val + 4);
+ accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0);
+ accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1);
+ }
+ }
+ for(oh = 0; oh < output_h; ++oh)
+ {
+ auto p_out = reinterpret_cast<float *>(p_out_base + oh * output_stride_y);
+ vst1q_f32(p_out, accum0[oh]);
+ vst1q_f32(p_out + 4, accum1[oh]);
+ }
+ }
+ },
+ in, out);
+ }
+};
+
template <typename T1, typename T2, unsigned int stridex>
class convolver_1x1
{
@@ -169,8 +370,7 @@
window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
-
+ Window window_k = calculate_max_window(*weights->info(), Steps(1u));
Iterator out(output, window_out);
Iterator in(input, window_in);
Iterator k(weights, window_k);
@@ -204,6 +404,7 @@
}
}
}
+
// Step 2
for(int p = 1; p < kernel_depth; ++p)
{
@@ -226,6 +427,148 @@
}
};
+#ifdef ARM_COMPUTE_ENABLE_FP16
+inline float16x8x3_t load_matrix_row(const float16_t *ptr)
+{
+ /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
+ r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
+ const float16x8x3_t r =
+ {
+ {
+ vld1q_dup_f16(ptr),
+ vld1q_dup_f16(1 + ptr),
+ vld1q_dup_f16(2 + ptr)
+ }
+ };
+ return r;
+}
+
+template <unsigned int stridex>
+float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position);
+
+template <>
+float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+
+ const float16x8x3_t vtop =
+ {
+ {
+ vld1q_f16(in_top),
+ vld1q_f16(in_top + 8),
+ vld1q_f16(in_top + 16)
+ }
+ };
+ const float16x8x3_t vmid =
+ {
+ {
+ vld1q_f16(in_mid),
+ vld1q_f16(in_mid + 8),
+ vld1q_f16(in_mid + 16)
+ }
+ };
+ const float16x8x3_t vlow =
+ {
+ {
+ vld1q_f16(in_low),
+ vld1q_f16(in_low + 8),
+ vld1q_f16(in_low + 16)
+ }
+ };
+ float16x8x2_t out =
+ {
+ {
+ vmulq_f16(vtop.val[0], m0.val[0]),
+ vmulq_f16(vtop.val[1], m0.val[0])
+ }
+ };
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 1), m1.val[1]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 2), m1.val[2]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 1), m2.val[1]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 2), m2.val[2]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 1), m0.val[1]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 2), m0.val[2]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vmid.val[1], m1.val[0]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 1), m1.val[1]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 2), m1.val[2]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vlow.val[1], m2.val[0]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
+ return out;
+}
+
+template <>
+inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position)
+{
+ float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 2);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 3);
+ return out;
+}
+
+template <>
+inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position)
+{
+ float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
+ return out;
+}
+
+template <unsigned int stridex>
+void store_results(float16_t *buffer, const float16x8x2_t &values);
+
+template <>
+void store_results<1>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1q_f16(buffer, values.val[0]);
+ vst1q_f16(buffer + 8, values.val[1]);
+}
+
+template <>
+void store_results<2>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1q_f16(buffer, values.val[0]);
+}
+
+template <>
+void store_results<3>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1_f16(buffer, vget_low_f16(values.val[0]));
+}
+
+template <unsigned int stridex>
+void accumulate_results(float16_t *buffer, const float16x8x2_t &values);
+
+template <>
+void accumulate_results<1>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
+ vst1q_f16(buffer + 8, vaddq_f16(vld1q_f16(buffer + 8), values.val[1]));
+}
+
+template <>
+void accumulate_results<2>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
+}
+
+template <>
+void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1_f16(buffer, vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0])));
+}
+
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
inline float32x4x3_t load_matrix_row(const float *ptr)
{
const float32x4x3_t r =
@@ -254,6 +597,159 @@
}
template <unsigned int stridex>
+float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
+ const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position);
+
+inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2)
+{
+ const float32x4x3_t m00 =
+ {
+ {
+ vld1q_dup_f32(m0),
+ vld1q_dup_f32(m1),
+ vld1q_dup_f32(m2)
+ }
+ };
+ return m00;
+}
+
+inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4)
+{
+ const float32x4x2_t m00 =
+ {
+ {
+ vld1q_dup_f32(m3),
+ vld1q_dup_f32(m4)
+ }
+ };
+ return m00;
+}
+
+inline float32x4x3_t load_input(const float *const in)
+{
+ const float32x4x3_t vin =
+ {
+ {
+ vld1q_f32(in),
+ vld1q_f32(in + 4),
+ vld1q_f32(in + 8)
+ }
+ };
+ return vin;
+}
+
+template <>
+inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
+ const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ const float32x4x3_t vin0 = load_input(in_0);
+ const float32x4x3_t vin1 = load_input(in_1);
+ const float32x4x3_t vin2 = load_input(in_2);
+ const float32x4x3_t vin3 = load_input(in_3);
+ const float32x4x3_t vin4 = load_input(in_4);
+ const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0);
+ const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0);
+ const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1);
+ const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1);
+ const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2);
+ const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2);
+ const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3);
+ const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3);
+ const float32x4x3_t m40 = load_matrix_hi(m4, 1 + m4, 2 + m4);
+ const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4);
+
+ float32x4x2_t out =
+ {
+ {
+ vmulq_f32(vin0.val[0], m00.val[0]),
+ vmulq_f32(vin0.val[1], m00.val[0])
+ }
+ };
+
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]);
+
+ out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], m11.val[1]);
+
+ out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]);
+
+ out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]);
+
+ out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]);
+
+ return out;
+}
+
+template <>
+inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
+ const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4, fixed_point_position);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
+ return out;
+}
+
+template <>
+inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
+ const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
+{
+ float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4, fixed_point_position);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
+ return out;
+}
+
+template <unsigned int stridex>
float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position);
template <>
@@ -294,17 +790,22 @@
};
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+
out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
+
out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
+
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
+
out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
+
out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
@@ -587,15 +1088,15 @@
1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values.
2) Convolve the remaining planes and accumulate the results in the output's plane which has been initialized in step 1.
*/
-
for(int oz = 0; oz < num_planes_z; ++oz)
{
+ const int zoffset = id.z() + oz;
uint8_t *p_out_base = out_ptr + oz * output_stride_z;
// Step 1
{
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
const auto vk_r0 = load_matrix_row(ptr_k_r0);
const auto vk_r1 = load_matrix_row(ptr_k_r1);
const auto vk_r2 = load_matrix_row(ptr_k_r2);
@@ -616,17 +1117,19 @@
// Step 2
for(int p = 1; p < kernel_depth; ++p)
{
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
- const auto vk_r0 = load_matrix_row(ptr_k_r0);
- const auto vk_r1 = load_matrix_row(ptr_k_r1);
- const auto vk_r2 = load_matrix_row(ptr_k_r2);
+ const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w;
+ const uint8_t *input_base = input_ptr + p * input_stride_z;
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(ptr_k_base);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y * 2);
+ const auto vk_r0 = load_matrix_row(ptr_k_r0);
+ const auto vk_r1 = load_matrix_row(ptr_k_r1);
+ const auto vk_r2 = load_matrix_row(ptr_k_r2);
for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
{
- auto in_top = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
- auto in_mid = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
- auto in_low = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
+ auto in_top = reinterpret_cast<const T1 *>(input_base + (ih + 0) * input_stride_y);
+ auto in_mid = reinterpret_cast<const T1 *>(input_base + (ih + 1) * input_stride_y);
+ auto in_low = reinterpret_cast<const T1 *>(input_base + (ih + 2) * input_stride_y);
auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
@@ -642,6 +1145,118 @@
}
};
+template <typename T1, typename T2, unsigned int stridex>
+class convolver_5x5
+{
+public:
+ static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ {
+ ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
+ const int input_stride_x = input->info()->strides_in_bytes().x();
+ const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+ const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int kernel_stride_x = weights->info()->strides_in_bytes().x();
+ const int kernel_stride_y = weights->info()->strides_in_bytes().y();
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z();
+ const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
+ const int output_w = output->info()->dimension(0);
+ const int output_h = output->info()->dimension(1);
+ const int num_planes_z = window.z().end() - window.z().start();
+ const int delta_input = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+ const int kernel_depth = weights->info()->dimension(Window::DimZ);
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
+ const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // setup output window for the iterator
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
+ window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+ window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
+
+ // setup input window for the iterator
+ Window window_in = window;
+ // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
+ window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window window_k = calculate_max_window(*weights->info(), Steps(1u));
+
+ Iterator out(output, window_out);
+ Iterator in(input, window_in);
+ Iterator k(weights, window_k);
+
+ const uint8_t *k_ptr = k.ptr();
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
+ uint8_t *out_ptr = out.ptr();
+ int ih = 0;
+ int oh = 0;
+ for(int oz = 0; oz < num_planes_z; ++oz)
+ {
+ const int zoffset = id.z() + oz;
+ uint8_t *p_out_base = out_ptr + oz * output_stride_z;
+ // Step 1
+ {
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ auto in_0 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
+ auto in_1 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
+ auto in_2 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
+ auto in_3 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y);
+ auto in_4 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y);
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
+ in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
+ {
+ auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4, fixed_point_position);
+ store_results<stridex>(p_out, vres);
+ }
+ }
+ }
+ // Step 2
+ for(int p = 1; p < kernel_depth; ++p)
+ {
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
+
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ auto in_0 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
+ auto in_1 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
+ auto in_2 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
+ auto in_3 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y);
+ auto in_4 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y);
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
+ in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
+ {
+ auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4, fixed_point_position);
+ accumulate_results<stridex>(p_out, vres);
+ }
+ }
+ }
+ }
+ },
+ in, out);
+ }
+};
+
template <typename T1, typename T2>
inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
@@ -663,6 +1278,47 @@
}
}
+template <>
+inline void convolve_1x1<float, float>(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ if(run_optim_small_tensor(input))
+ {
+ switch(conv_stride_x)
+ {
+ case 1:
+ convolver_w1x1_i8x8_f32<1>::convolve(window, input, weights, output, conv_info);
+ break;
+ case 2:
+ convolver_w1x1_i8x8_f32<2>::convolve(window, input, weights, output, conv_info);
+ break;
+ case 3:
+ convolver_w1x1_i8x8_f32<3>::convolve(window, input, weights, output, conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ }
+ else
+ {
+ switch(conv_stride_x)
+ {
+ case 1:
+ convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 2:
+ convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 3:
+ convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ }
+}
+
template <typename T1, typename T2>
inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
@@ -683,10 +1339,33 @@
ARM_COMPUTE_ERROR("Not implemented");
}
}
+
+template <typename T1, typename T2>
+inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ switch(conv_stride_x)
+ {
+ case 1:
+ convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 2:
+ convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 3:
+ convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+
} // namespace
NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel()
- : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_elems_read_per_iteration(0), _num_elems_written_per_iteration(0)
+ : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_weight_elems_read_per_row(0), _num_elems_read_per_iteration(0),
+ _num_elems_written_per_iteration(0)
{
}
@@ -697,14 +1376,19 @@
void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())),
"Pad > 0 not supported for 1x1 weights");
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1),
"Pad > 1 not supported for 3x3 weights");
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 5 && (std::get<0>(conv_info.pad()) > 2 || std::get<1>(conv_info.pad()) > 2),
+ "Pad > 2 not supported for 5x5 weights");
+
ARM_COMPUTE_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
@@ -717,53 +1401,88 @@
_kernel_size = weights->info()->dimension(0);
_border_size = BorderSize(conv_pad_y, conv_pad_x);
- Window win = calculate_max_window(*output->info());
+ const unsigned int kernel_size = weights->info()->dimension(0);
+
+ // Get convolved dimensions
+ unsigned int output_width = 0;
+ unsigned int output_height = 0;
+ std::tie(output_width, output_height) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info);
+
+ TensorShape output_shape = input->info()->tensor_shape();
+ output_shape.set(0, output_width);
+ output_shape.set(1, output_height);
+ output_shape.set(2, weights->info()->dimension(3));
+
+ DataType data_type = input->info()->data_type();
+
+ if(is_data_type_fixed_point(data_type))
+ {
+ // Promote data type in case of fixed point
+ data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
+ }
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, data_type, input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, output->info()->data_type());
switch(_kernel_size)
{
case 1:
{
- _num_elems_written_per_iteration = (input->info()->data_type() == DataType::QS8) ? 8 : 4;
- _num_elems_read_per_iteration = conv_stride_x * _num_elems_written_per_iteration;
-
- win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, _num_elems_read_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ switch(input->info()->data_type())
+ {
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ case DataType::QS8:
+ case DataType::QS16:
+ _num_elems_written_per_iteration = 8;
+ break;
+ case DataType::F32:
+ if(run_optim_small_tensor(input))
+ {
+ _num_elems_written_per_iteration = 8;
+ }
+ else
+ {
+ _num_elems_written_per_iteration = 4;
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported.");
+ break;
+ }
+ _num_weight_elems_read_per_row = kernel_size;
+ _num_elems_read_per_iteration = conv_stride_x * _num_elems_written_per_iteration;
break;
}
case 3:
+ case 5:
{
- if(input->info()->data_type() == DataType::F32)
+ switch(input->info()->data_type())
{
- _num_elems_read_per_iteration = 12;
- _num_elems_written_per_iteration = 16 >> conv_stride_x;
+ case DataType::F32:
+ _num_weight_elems_read_per_row = 4 + _kernel_size - 1;
+ _num_elems_read_per_iteration = 12;
+ _num_elems_written_per_iteration = 16 >> conv_stride_x;
+ break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ case DataType::QS8:
+ case DataType::QS16:
+ _num_weight_elems_read_per_row = 8 + _kernel_size - 1;
+ _num_elems_read_per_iteration = 24;
+ _num_elems_written_per_iteration = 32 >> conv_stride_x;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported.");
+ break;
}
- else
- {
- _num_elems_read_per_iteration = 24;
- _num_elems_written_per_iteration = 32 >> conv_stride_x;
- }
-
- // Calculate right and bottom border
- const unsigned int conv_stride_y = std::get<1>(_conv_info.stride());
- const int input_width = input->info()->dimension(0);
- const int input_height = input->info()->dimension(1);
- const int upper_bound_w = ceil_to_multiple(((output->info()->dimension(0) - 1) * conv_stride_x + _kernel_size), _num_elems_read_per_iteration) - conv_pad_x - input_width;
- const int upper_bound_h = ((output->info()->dimension(1) - 1) * conv_stride_y - conv_pad_y + _kernel_size) - input_height;
- _border_size.right = std::max(upper_bound_w, static_cast<int>(_kernel_size));
- _border_size.bottom = std::max(upper_bound_h, static_cast<int>(_kernel_size));
-
- // Create window and update padding
- win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
- AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
- AccessWindowStatic weights_access(weights->info(), 0, 0, _kernel_size, _kernel_size);
- AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
- update_window_and_padding(win, input_access, weights_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
- break;
}
+ break;
default:
{
ARM_COMPUTE_ERROR("Not implemented");
@@ -771,11 +1490,27 @@
}
}
+ // Calculate right and bottom border
+ const unsigned int conv_stride_y = std::get<1>(_conv_info.stride());
+ const int input_width = input->info()->dimension(0);
+ const int input_height = input->info()->dimension(1);
+ const int upper_bound_w = ceil_to_multiple(((output->info()->dimension(0) - 1) * conv_stride_x + _kernel_size), _num_elems_read_per_iteration) - conv_pad_x - input_width;
+ const int upper_bound_h = ((output->info()->dimension(1) - 1) * conv_stride_y - conv_pad_y + _kernel_size) - input_height;
+ _border_size.right = std::max(upper_bound_w, static_cast<int>(_kernel_size));
+ _border_size.bottom = std::max(upper_bound_h, static_cast<int>(_kernel_size));
+ Window win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
+ AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
+ AccessWindowStatic weights_access(weights->info(), 0, 0, _num_weight_elems_read_per_row, _kernel_size);
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
+ update_window_and_padding(win, input_access, weights_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
INEKernel::configure(win);
}
-void NEDirectConvolutionLayerKernel::run(const Window &window)
+void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
@@ -786,31 +1521,66 @@
{
case 1:
{
- if(_input->info()->data_type() == DataType::QS8)
+ switch(_input->info()->data_type())
{
- convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- }
- else
- {
- convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ case DataType::QS8:
+ convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ case DataType::QS16:
+ convolve_1x1<qint16_t, qint32_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ case DataType::F32:
+ convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
}
break;
}
case 3:
{
- if(_input->info()->data_type() == DataType::QS8)
+ switch(_input->info()->data_type())
{
- convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- }
- else
- {
- convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ case DataType::QS8:
+ convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ case DataType::F32:
+ convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
}
break;
}
+ case 5:
+ {
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ break;
+ }
+
default:
{
- ARM_COMPUTE_ERROR("Only kernel sizes 1x1 and 3x3 are supported.");
+ ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
break;
}
}
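
For reference, the output shape computed in configure() above follows the usual direct-convolution size formula; a minimal standalone sketch of that arithmetic (the floor rounding policy of scaled_dimensions() is an assumption here, and the helper name is illustrative):

#include <cstdio>

// Illustrative sketch: output extent of a direct convolution along one axis,
// i.e. floor((in + 2 * pad - kernel) / stride) + 1.
static unsigned int convolved_extent(unsigned int in, unsigned int kernel,
                                     unsigned int stride, unsigned int pad)
{
    return (in + 2 * pad - kernel) / stride + 1;
}

int main()
{
    // e.g. 64x64 input, 5x5 weights, stride 2, pad 2 -> 32x32 output
    const unsigned int out_w = convolved_extent(64, 5, 2, 2);
    const unsigned int out_h = convolved_extent(64, 5, 2, 2);
    std::printf("%ux%u\n", out_w, out_h);
    return 0;
}
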
diff --git a/src/core/NEON/kernels/NEErodeKernel.cpp b/src/core/NEON/kernels/NEErodeKernel.cpp
index 3985036..88c20f8 100644
--- a/src/core/NEON/kernels/NEErodeKernel.cpp
+++ b/src/core/NEON/kernels/NEErodeKernel.cpp
@@ -67,8 +67,9 @@
INEKernel::configure(win);
}
-void NEErodeKernel::run(const Window &window)
+void NEErodeKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEFastCornersKernel.cpp b/src/core/NEON/kernels/NEFastCornersKernel.cpp
index 9e8b552..919efd2 100644
--- a/src/core/NEON/kernels/NEFastCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEFastCornersKernel.cpp
@@ -388,8 +388,9 @@
INEKernel::configure(win);
}
-void NEFastCornersKernel::run(const Window &window)
+void NEFastCornersKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEFillArrayKernel.cpp b/src/core/NEON/kernels/NEFillArrayKernel.cpp
index 7e7e1c2..5a2e1a0 100644
--- a/src/core/NEON/kernels/NEFillArrayKernel.cpp
+++ b/src/core/NEON/kernels/NEFillArrayKernel.cpp
@@ -62,8 +62,9 @@
return false;
}
-void NEFillArrayKernel::run(const Window &window)
+void NEFillArrayKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index bd99242..9505a25 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
@@ -35,6 +36,63 @@
using namespace arm_compute;
+namespace
+{
+template <typename T, unsigned int leftx, unsigned int rightx>
+void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value);
+
+template <>
+inline void fill_constant_value_single_channel_special<float, 1u, 1u>(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value)
+{
+ float border_value;
+ constant_border_value.get(border_value);
+ uint8_t *const start_valid_region = tensor->ptr_to_element(tensor->info()->valid_region().anchor);
+ const size_t &width = tensor->info()->valid_region().shape[0];
+ const size_t &height = tensor->info()->valid_region().shape[1];
+ const int stridey = tensor->info()->strides_in_bytes()[1];
+
+ // Left and right border
+ Window vertical(window);
+ vertical.set(Window::DimY, Window::Dimension(0, height, 1));
+
+ Iterator vertical_it(tensor, vertical);
+
+ execute_window_loop(vertical, [&](const Coordinates &)
+ {
+ const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset());
+
+ // Fill left and right borders
+ *(row_start - 1) = border_value;
+ std::fill_n(row_start + width, right, border_value);
+ },
+ vertical_it);
+
+ // Top and bottom border
+ Iterator plane_it(tensor, window);
+
+ // Iterate over all XY planes
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ const auto row_start = reinterpret_cast<float *>(base_addr - stridey);
+ // Fill top rows including left/right borders
+ std::fill_n(row_start - 1, 1 + width + right, border_value);
+
+ // Bottom border
+ const unsigned low_border_size = height + bottom;
+ for(unsigned int i = height; i < low_border_size; ++i)
+ {
+ const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey);
+
+ // Fill bottom rows including left/right borders
+ std::fill_n(row_start - 1, 1 + width + right, border_value);
+ }
+ },
+ plane_it);
+}
+} // namespace
+
namespace arm_compute
{
class Coordinates;
@@ -47,7 +105,7 @@
void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
_tensor = tensor;
_border_size = border_size;
@@ -59,12 +117,14 @@
Window win;
win.set(Window::DimX, Window::Dimension(0, 1, 1));
win.set(Window::DimY, Window::Dimension(0, 1, 1));
- win.use_tensor_dimensions(_tensor->info(), Window::DimZ);
+ win.use_tensor_dimensions(_tensor->info()->tensor_shape(), Window::DimZ);
INEKernel::configure(win);
}
-void NEFillBorderKernel::run(const Window &window)
+void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
+
// If there is no border: early exit
if(_border_size.empty())
{
@@ -100,9 +160,20 @@
case DataType::S32:
fill_constant_value_single_channel<int32_t>(window);
break;
+ case DataType::F16:
+ static_assert(sizeof(half) == 2, "Float16_t must be 16 bit");
+ fill_constant_value_single_channel<half>(window);
+ break;
case DataType::F32:
static_assert(sizeof(float) == 4, "Float must be 32 bit");
- fill_constant_value_single_channel<float>(window);
+ if(_border_size.left == 1 && _border_size.top == 1)
+ {
+ fill_constant_value_single_channel_special<float, 1u, 1u>(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
+ }
+ else
+ {
+ fill_constant_value_single_channel<float>(window);
+ }
break;
default:
ARM_COMPUTE_ERROR("Not handled");
@@ -133,6 +204,10 @@
case DataType::S32:
fill_replicate_single_channel<int32_t>(window);
break;
+ case DataType::F16:
+ static_assert(sizeof(half) == 2, "Float16_t must be 16 bit");
+ fill_replicate_single_channel<half>(window);
+ break;
case DataType::F32:
static_assert(sizeof(float) == 4, "Float must be 32 bit");
fill_replicate_single_channel<float>(window);
@@ -214,6 +289,7 @@
uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
const size_t &width = _tensor->info()->valid_region().shape[0];
const size_t &height = _tensor->info()->valid_region().shape[1];
+ const int stridey = _tensor->info()->strides_in_bytes()[1];
// Left and right border
Window vertical(window);
@@ -237,19 +313,21 @@
// Iterate over all XY planes
execute_window_loop(window, [&](const Coordinates & id)
{
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
// Top border
for(int i = -_border_size.top; i < 0; ++i)
{
- const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
+ const auto row_start = reinterpret_cast<T *>(base_addr + i * stridey);
// Fill top rows including left/right borders
std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value);
}
// Bottom border
- for(unsigned int i = height; i < height + _border_size.bottom; ++i)
+ const unsigned low_border_size = height + _border_size.bottom;
+ for(unsigned int i = height; i < low_border_size; ++i)
{
- const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
+ const auto row_start = reinterpret_cast<T *>(base_addr + i * stridey);
// Fill bottom rows including left/right borders
std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value);
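
The constant-border paths above (both the generic template and the new left = top = 1 specialisation) fill whole padded rows for the top/bottom borders and short runs per row for the left/right borders. A scalar sketch of the same pattern on a plain row-major buffer, with the tensor strides simplified away (illustrative; the function name is not the library's):

#include <algorithm>
#include <vector>

// Fill a constant border around a W x H valid region stored inside a padded
// row-major buffer of width (left + W + right) and height (top + H + bottom).
void fill_constant_border(std::vector<float> &buf, int W, int H,
                          int left, int right, int top, int bottom, float value)
{
    const int stride = left + W + right;

    // Top and bottom borders: fill whole padded rows.
    for(int y = 0; y < top; ++y)
    {
        std::fill_n(buf.begin() + y * stride, stride, value);
    }
    for(int y = top + H; y < top + H + bottom; ++y)
    {
        std::fill_n(buf.begin() + y * stride, stride, value);
    }

    // Left and right borders: short runs at both ends of each valid row.
    for(int y = top; y < top + H; ++y)
    {
        std::fill_n(buf.begin() + y * stride, left, value);
        std::fill_n(buf.begin() + y * stride + left + W, right, value);
    }
}
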
diff --git a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
index 699a5d9..017e259 100644
--- a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
@@ -57,12 +57,13 @@
Window win;
win.set(Window::DimX, Window::Dimension(0, 1, 1));
win.set(Window::DimY, Window::Dimension(0, 1, 1));
- win.use_tensor_dimensions(_tensor->info(), Window::DimZ);
+ win.use_tensor_dimensions(_tensor->info()->tensor_shape(), Window::DimZ);
INEKernel::configure(win);
}
-void NEFillInnerBorderKernel::run(const Window &window)
+void NEFillInnerBorderKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp
new file mode 100644
index 0000000..dd85ac1
--- /dev/null
+++ b/src/core/NEON/kernels/NEFloorKernel.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+void NEFloorKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ set_data_type_if_unknown(*input->info(), DataType::F32);
+ set_data_type_if_unknown(*output->info(), DataType::F32);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEFloorKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4_t res = vfloorq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
+ },
+ input, output);
+}
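
NEFloorKernel applies floor() element-wise, four F32 lanes per iteration, and relies on the access-window padding so no scalar tail loop is required. A scalar reference of the operation it vectorises (illustrative):

#include <cmath>
#include <cstddef>

// Scalar reference: out[i] = floor(in[i]) over a contiguous F32 buffer.
void floor_reference(const float *in, float *out, std::size_t count)
{
    for(std::size_t i = 0; i < count; ++i)
    {
        out[i] = std::floor(in[i]);
    }
}
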
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 3ff8b7b..ae5d456 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -85,10 +85,10 @@
const uint16x4x4_t data =
{
{
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 0 * in_stride)),
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 1 * in_stride)),
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 2 * in_stride)),
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 3 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 0 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 1 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 2 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 3 * in_stride)),
}
};
vst4_u16(reinterpret_cast<uint16_t *>(out.ptr()), data);
@@ -113,10 +113,10 @@
const uint32x4x4_t data =
{
{
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 0 * in_stride)),
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 1 * in_stride)),
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 2 * in_stride)),
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 3 * in_stride))
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 0 * in_stride)),
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 1 * in_stride)),
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 2 * in_stride)),
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 3 * in_stride))
}
};
vst4q_u32(reinterpret_cast<uint32_t *>(out.ptr()), data);
@@ -132,11 +132,20 @@
void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ TensorShape output_shape = input->info()->tensor_shape();
+ output_shape.set(0, input->info()->dimension(0) * 4);
+ output_shape.set(1, std::ceil(input->info()->dimension(1) / 4.0f));
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(input->info()->dimension(1) / 4.0f));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
@@ -173,8 +182,9 @@
INEKernel::configure(win);
}
-void NEGEMMInterleave4x4Kernel::run(const Window &window)
+void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
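
The auto-initialised output shape above encodes the 4x4 interleave layout: width becomes width * 4 and height becomes ceil(height / 4), because each output row packs four consecutive input rows column by column. A scalar sketch of that interleave for a height that is a multiple of 4 (illustrative; no tail or padding handling):

#include <vector>

// Interleave rows in groups of 4: output row r holds
// in(4r, 0), in(4r+1, 0), in(4r+2, 0), in(4r+3, 0), in(4r, 1), ...
// so the output is (width * 4) wide and (height / 4) tall.
std::vector<float> interleave_4x4(const std::vector<float> &in, int width, int height)
{
    std::vector<float> out(in.size());
    for(int y = 0; y < height; y += 4)
    {
        float *dst = &out[(y / 4) * width * 4];
        for(int x = 0; x < width; ++x)
        {
            for(int r = 0; r < 4; ++r)
            {
                *dst++ = in[(y + r) * width + x];
            }
        }
    }
    return out;
}
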
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index 3558c68..cbba446 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -81,8 +81,9 @@
INEKernel::configure(win);
}
-void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window)
+void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
index 7a3bae5..fb07cb0 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -45,10 +45,10 @@
void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(biases, accum);
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
_biases = biases;
_accum = accum;
@@ -58,11 +58,9 @@
// Configure kernel window
Window win = calculate_max_window(*accum->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowStatic biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1));
-
update_window_and_padding(win,
AccessWindowHorizontal(accum->info(), 0, num_elems_processed_per_iteration),
- biases_access);
+ AccessWindowStatic(biases->info(), 0, 0, win.x().end(), biases->info()->tensor_shape().y()));
AccessWindowHorizontal output_access(accum->info(), 0, num_elems_processed_per_iteration);
@@ -74,8 +72,9 @@
INEKernel::configure(win);
}
-void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window)
+void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
@@ -109,6 +108,27 @@
in0_out, in1);
break;
}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float16x8x2_t accum = vld2q_f16(reinterpret_cast<const float16_t *>(in0_out.ptr()));
+ const float16x8x2_t biases = vld2q_f16(reinterpret_cast<const float16_t *>(in1.ptr()));
+ const float16x8x2_t res =
+ {
+ {
+ vaddq_f16(accum.val[0], biases.val[0]),
+ vaddq_f16(accum.val[1], biases.val[1])
+ }
+ };
+
+ vst2q_f16(reinterpret_cast<float16_t *>(in0_out.ptr()), res);
+ },
+ in0_out, in1);
+ break;
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::QS8:
{
execute_window_loop(window, [&](const Coordinates & id)
@@ -121,6 +141,21 @@
in0_out, in1);
break;
}
+ case DataType::QS16:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ qint16x8x2_t accum = vld2q_s16(reinterpret_cast<const qint16_t *>(in0_out.ptr()));
+ const qint16x8x2_t biases = vld2q_s16(reinterpret_cast<const qint16_t *>(in1.ptr()));
+
+ accum.val[0] = vqaddq_qs16(accum.val[0], biases.val[0]);
+ accum.val[1] = vqaddq_qs16(accum.val[1], biases.val[1]);
+
+ vst2q_s16(reinterpret_cast<qint16_t *>(in0_out.ptr()), accum);
+ },
+ in0_out, in1);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
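
All branches of this kernel do the same thing at different element widths: broadcast a 1-D bias vector along the rows of the accumulator. A scalar reference (illustrative):

#include <cstddef>

// Scalar reference: accum(x, y) += biases(x) for every row y.
void accumulate_biases(float *accum, const float *biases,
                       std::size_t width, std::size_t height)
{
    for(std::size_t y = 0; y < height; ++y)
    {
        for(std::size_t x = 0; x < width; ++x)
        {
            accum[y * width + x] += biases[x];
        }
    }
}
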
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index 71dd4c7..9dbce1d 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -52,25 +52,8 @@
const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
const auto out_ptr = reinterpret_cast<float *>(out.ptr());
- float32x4x4_t alpha_ab =
- {
- {
- vld1q_f32(out_ptr + 0),
- vld1q_f32(out_ptr + 4),
- vld1q_f32(out_ptr + 8),
- vld1q_f32(out_ptr + 12)
- }
- };
-
- const float32x4x4_t c =
- {
- {
- vld1q_f32(in_ptr + 0),
- vld1q_f32(in_ptr + 4),
- vld1q_f32(in_ptr + 8),
- vld1q_f32(in_ptr + 12)
- }
- };
+ float32x4x4_t alpha_ab = vld4q_f32(out_ptr);
+ const float32x4x4_t c = vld4q_f32(in_ptr);
// Multiply matrix C by its weight and accumulate
alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
@@ -78,10 +61,7 @@
alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
- vst1q_f32(out_ptr + 0, alpha_ab.val[0]);
- vst1q_f32(out_ptr + 4, alpha_ab.val[1]);
- vst1q_f32(out_ptr + 8, alpha_ab.val[2]);
- vst1q_f32(out_ptr + 12, alpha_ab.val[3]);
+ vst4q_f32(out_ptr, alpha_ab);
},
in, out);
}
@@ -99,37 +79,22 @@
const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
- float16x8x2_t alpha_ab =
- {
- {
- vld1q_f16(out_ptr + 0),
- vld1q_f16(out_ptr + 8)
- }
- };
-
- float16x8x2_t c =
- {
- {
- vld1q_f16(in_ptr + 0),
- vld1q_f16(in_ptr + 8)
- }
- };
-
+ float16x8x2_t alpha_ab = vld2q_f16(out_ptr);
+ const float16x8x2_t c = vld2q_f16(in_ptr);
// Multiply matrix C by its weight and accumulate
alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
- vst1q_f16(out_ptr + 0, alpha_ab.val[0]);
- vst1q_f16(out_ptr + 8, alpha_ab.val[1]);
+ vst2q_f16(out_ptr + 0, alpha_ab);
},
in, out);
}
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta)
{
const int fixed_point_position = input->info()->fixed_point_position();
- const qint8x16_t beta_qs8 = vdupq_n_qs8(scvt_qs8_f32(beta, fixed_point_position));
+ const qint8x16_t beta_qs8 = vdupq_n_qs8(sqcvt_qs8_f32(beta, fixed_point_position));
Iterator in(input, window);
Iterator out(output, window);
@@ -149,6 +114,31 @@
},
in, out);
}
+
+void matrix_addition_qs16(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
+ const int fixed_point_position = input->info()->fixed_point_position();
+ const qint16x8_t beta_qs16 = vdupq_n_qs16(sqcvt_qs16_f32(beta, fixed_point_position));
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint16_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<qint16_t *>(out.ptr());
+
+ qint16x8x2_t alpha_ab = vld2q_s16(out_ptr);
+ const qint16x8x2_t c = vld2q_s16(in_ptr);
+
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab.val[0] = vqmlaq_qs16(alpha_ab.val[0], c.val[0], beta_qs16, fixed_point_position);
+ alpha_ab.val[1] = vqmlaq_qs16(alpha_ab.val[1], c.val[1], beta_qs16, fixed_point_position);
+
+ vst2q_s16(out_ptr, alpha_ab);
+ },
+ in, out);
+}
} // namespace
NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel()
@@ -158,8 +148,8 @@
void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
@@ -173,11 +163,14 @@
case DataType::QS8:
_func = &matrix_addition_qs8;
break;
+ case DataType::QS16:
+ _func = &matrix_addition_qs16;
+ break;
case DataType::F16:
#ifdef ARM_COMPUTE_ENABLE_FP16
_func = &matrix_addition_f16;
break;
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
@@ -190,8 +183,9 @@
_beta = beta;
}
-void NEGEMMMatrixAdditionKernel::run(const Window &window)
+void NEGEMMMatrixAdditionKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
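
Every matrix_addition_* variant computes out += beta * in element-wise (out already holds alpha * A * B). Because the operation is purely element-wise, the vld4q/vst4q de-interleave and re-interleave pairs used above leave the same result in memory as contiguous loads and stores. A scalar reference (illustrative):

#include <cstddef>

// Scalar reference: out[i] = out[i] + beta * in[i].
void matrix_addition_reference(const float *in, float *out, std::size_t count, float beta)
{
    for(std::size_t i = 0; i < count; ++i)
    {
        out[i] += beta * in[i];
    }
}
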
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index dcfbb13..6909082 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -50,15 +51,162 @@
namespace
{
template <bool multiply_alpha>
-void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+void vector_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+
+ // The implementation computes 32 elements per iteration
+ const int window_start_x = 32 * info.thread_id;
+ const int window_step_x = 32 * info.num_threads;
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+ ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x");
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, win_out);
+
+ const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
+ ARM_COMPUTE_UNUSED(alpha_f16);
+
+ execute_window_loop(win_out, [&](const Coordinates & id)
+ {
+ if(id.x() > width_matrix_b)
+ {
+ return;
+ }
+
+ float16x8_t acc0 = vdupq_n_f16(0.f);
+ float16x8_t acc1 = vdupq_n_f16(0.f);
+ float16x8_t acc2 = vdupq_n_f16(0.f);
+ float16x8_t acc3 = vdupq_n_f16(0.f);
+
+ auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr());
+
+ const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
+ for(; vec_a <= (vec_a_end_addr - 4);)
+ {
+ const float16x4_t a0l = vld1_f16(vec_a);
+
+ float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+ float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+ float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+ float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+ float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));
+
+ matrix_b += 2 * in_b_stride;
+
+ b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+ b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+ b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+ b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));
+
+ vec_a += 4;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const float16_t a0 = *vec_a;
+ const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
+ acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
+ acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
+ acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
+ }
+
+ // Multiply by the weight of matrix product (alpha)
+ if(multiply_alpha)
+ {
+ acc0 = vmulq_f16(acc0, alpha_f16);
+ acc1 = vmulq_f16(acc1, alpha_f16);
+ acc2 = vmulq_f16(acc2, alpha_f16);
+ acc3 = vmulq_f16(acc3, alpha_f16);
+ }
+
+ const auto vec_out = reinterpret_cast<float16_t *>(out.ptr());
+
+ vst1q_f16(vec_out + 0, acc0);
+ vst1q_f16(vec_out + 8, acc1);
+ vst1q_f16(vec_out + 16, acc2);
+ vst1q_f16(vec_out + 24, acc3);
+
+ },
+ ina, inb, out);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(input0);
+ ARM_COMPUTE_UNUSED(input1);
+ ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_ERROR("Not implemented");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
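
The vector-matrix kernels distribute the columns of matrix B across threads with an interleaved stride: each thread starts at elements_per_iteration * thread_id and advances by elements_per_iteration * num_threads, with the window end rounded up so every thread runs the same number of iterations (out-of-range iterations return early inside the loop body). A standalone sketch of that partitioning, assuming ceil_to_multiple() rounds a non-negative value up to the next multiple (names are illustrative):

#include <cstdio>

// Round a non-negative value up to the next multiple of divisor (divisor > 0).
static int ceil_to_multiple_of(int value, int divisor)
{
    return ((value + divisor - 1) / divisor) * divisor;
}

int main()
{
    const int width_matrix_b = 100; // columns of matrix B / the output
    const int per_iteration  = 32;  // elements computed per loop iteration
    const int num_threads    = 4;

    for(int thread_id = 0; thread_id < num_threads; ++thread_id)
    {
        const int start = per_iteration * thread_id;
        const int step  = per_iteration * num_threads;
        const int end   = ceil_to_multiple_of(width_matrix_b - start, step) + start;

        // This thread visits x = start, start + step, ...; iterations whose
        // x falls past width_matrix_b are skipped inside the kernel loop.
        std::printf("thread %d: x = %d, %d, ... (end %d, step %d)\n",
                    thread_id, start, start + step, end, step);
    }
    return 0;
}
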
+
+template <bool multiply_alpha>
+void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
{
const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
// The implementation computes 16 elements per iteration
- const int window_start_x = 16 * window.thread_id();
- const int window_step_x = 16 * window.num_threads();
+ const int window_start_x = 16 * info.thread_id;
+ const int window_step_x = 16 * info.num_threads;
// Make sure (window_end_x - window_start_x) is a multiple of window_step_x
const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
@@ -103,7 +251,7 @@
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
-#endif
+#endif /* __arm__ */
auto vec_a_end_addr = vec_a + num_elems_vec_a;
for(; vec_a <= (vec_a_end_addr - 4);)
@@ -126,7 +274,7 @@
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
-#endif
+#endif /* __arm__ */
acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
@@ -206,7 +354,7 @@
}
template <bool multiply_alpha>
-void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
{
const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
@@ -214,8 +362,8 @@
const int fixed_point_position = input0->info()->fixed_point_position();
// The implementation computes 32 elements per iteration
- const int window_start_x = 32 * window.thread_id();
- const int window_step_x = 32 * window.num_threads();
+ const int window_start_x = 32 * info.thread_id;
+ const int window_step_x = 32 * info.num_threads;
// Make sure (window_end_x - window_start_x) is a multiple of window_step_x
const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
@@ -315,7 +463,7 @@
// Multiply by the weight of the matrix product (alpha)
if(multiply_alpha)
{
- const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+ const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
@@ -334,6 +482,135 @@
}
template <bool multiply_alpha>
+void vector_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
+{
+ const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+ const int fixed_point_position = input0->info()->fixed_point_position();
+
+ // The implementation computes 16 elements per iteration
+ const int window_start_x = 16 * info.thread_id;
+ const int window_step_x = 16 * info.num_threads;
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+ ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x");
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, win_out);
+
+ execute_window_loop(win_out, [&](const Coordinates & id)
+ {
+ if(id.x() > width_matrix_b)
+ {
+ return;
+ }
+
+ // Reset accumulators
+ qint32x4_t acc00_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc01_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc02_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc03_qs32 = vdupq_n_qs32(0);
+
+ auto vec_a = reinterpret_cast<const qint16_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const qint16_t *>(inb.ptr());
+
+ auto vec_a_end_addr = vec_a + num_elems_vec_a;
+ for(; vec_a <= (vec_a_end_addr - 2);)
+ {
+ const qint16x4_t a0 = vld1_dup_qs16(vec_a + 0);
+ const qint16x4_t a1 = vld1_dup_qs16(vec_a + 1);
+
+ const qint16x4_t b00 = vld1_qs16(matrix_b + 0 + 0 * in_b_stride);
+ const qint16x4_t b01 = vld1_qs16(matrix_b + 4 + 0 * in_b_stride);
+ const qint16x4_t b02 = vld1_qs16(matrix_b + 8 + 0 * in_b_stride);
+ const qint16x4_t b03 = vld1_qs16(matrix_b + 12 + 0 * in_b_stride);
+ const qint16x4_t b10 = vld1_qs16(matrix_b + 0 + 1 * in_b_stride);
+ const qint16x4_t b11 = vld1_qs16(matrix_b + 4 + 1 * in_b_stride);
+ const qint16x4_t b12 = vld1_qs16(matrix_b + 8 + 1 * in_b_stride);
+ const qint16x4_t b13 = vld1_qs16(matrix_b + 12 + 1 * in_b_stride);
+
+ // First accumulation
+ acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
+ acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
+ acc02_qs32 = vqmlal_qs16(acc02_qs32, b02, a0, fixed_point_position);
+ acc03_qs32 = vqmlal_qs16(acc03_qs32, b03, a0, fixed_point_position);
+
+ // Second accumulation
+ acc00_qs32 = vqmlal_qs16(acc00_qs32, b10, a1, fixed_point_position);
+ acc01_qs32 = vqmlal_qs16(acc01_qs32, b11, a1, fixed_point_position);
+ acc02_qs32 = vqmlal_qs16(acc02_qs32, b12, a1, fixed_point_position);
+ acc03_qs32 = vqmlal_qs16(acc03_qs32, b13, a1, fixed_point_position);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const qint16x4_t a0 = vld1_dup_qs16(vec_a);
+
+ const qint16x4_t b00 = vld1_qs16(matrix_b + 0);
+ const qint16x4_t b01 = vld1_qs16(matrix_b + 4);
+ const qint16x4_t b02 = vld1_qs16(matrix_b + 8);
+ const qint16x4_t b03 = vld1_qs16(matrix_b + 12);
+
+ acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
+ acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
+ acc02_qs32 = vqmlal_qs16(acc02_qs32, b02, a0, fixed_point_position);
+ acc03_qs32 = vqmlal_qs16(acc03_qs32, b03, a0, fixed_point_position);
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
+ }
+
+ // Convert back to qint16x4_t and saturate
+ qint16x4_t acc00_qs16 = vqmovn_qs32(acc00_qs32);
+ qint16x4_t acc01_qs16 = vqmovn_qs32(acc01_qs32);
+ qint16x4_t acc02_qs16 = vqmovn_qs32(acc02_qs32);
+ qint16x4_t acc03_qs16 = vqmovn_qs32(acc03_qs32);
+
+ // Multiply by the weight of the matrix product (alpha)
+ if(multiply_alpha)
+ {
+ const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
+ acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position);
+ acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position);
+ acc02_qs16 = vqmul_qs16(acc02_qs16, alpha_qs16, fixed_point_position);
+ acc03_qs16 = vqmul_qs16(acc03_qs16, alpha_qs16, fixed_point_position);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<qint16_t *>(out.ptr());
+
+ // Store 16x4 output elements
+ vst1_qs16(mtx_out0 + 0, acc00_qs16);
+ vst1_qs16(mtx_out0 + 4, acc01_qs16);
+ vst1_qs16(mtx_out0 + 8, acc02_qs16);
+ vst1_qs16(mtx_out0 + 12, acc03_qs16);
+ },
+ ina, inb, out);
+}
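
The QS16 path widens into 32-bit accumulators via vqmlal_qs16 and narrows back with saturation via vqmovn_qs32. A scalar sketch of the fixed-point arithmetic behind those steps, with the shift by fixed_point_position made explicit (illustrative; rounding simplified to truncation, and the helper names are not the library's):

#include <algorithm>
#include <cstdint>

// Q(fp) values are stored in int16_t; the accumulator is a wider int32_t.
// Multiplying two Q(fp) numbers yields Q(2 * fp); shifting right by fp
// brings the product back to Q(fp) before accumulation.
static int32_t fixed_point_mla(int32_t acc, int16_t a, int16_t b, int fp)
{
    const int64_t product = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    return acc + static_cast<int32_t>(product >> fp);
}

// Saturating narrow from the 32-bit accumulator back to int16_t.
static int16_t saturating_narrow(int32_t v)
{
    return static_cast<int16_t>(std::min<int32_t>(std::max<int32_t>(v, INT16_MIN), INT16_MAX));
}
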
+
+template <bool multiply_alpha>
void matrix_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
{
const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
@@ -386,7 +663,7 @@
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif
+#endif /* __arm__ */
auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
@@ -405,7 +682,7 @@
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif
+#endif /* __arm__ */
// 4x4 block 0
acc00 = vmlaq_f32(acc00, b00, a0);
@@ -496,7 +773,7 @@
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif
+#endif /* __arm__ */
// 4x4 block 0
acc00 = vmlaq_f32(acc00, b00, a0);
@@ -587,7 +864,7 @@
asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif
+#endif /* __arm__ */
// 4x4 block 0
acc00 = vmlaq_f32(acc00, b00, a0);
acc10 = vmlaq_f32(acc10, b00, a1);
@@ -639,8 +916,9 @@
void matrix_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
{
#ifdef ARM_COMPUTE_ENABLE_FP16
- const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
- const size_t out_stride = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
+ const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
+ const size_t out_stride = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
+ const int num_elems_matrix_b_x = input1->info()->dimension(0);
// Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
Window win_a(window);
@@ -662,9 +940,6 @@
Iterator inb(input1, win_b);
Iterator out(output, window);
- // Number of iterations of inner loop. Since 8 is the number of accumulations per loop, num_it = (width_mtx_b / 4) / 8
- const size_t num_it = ((input1->info()->dimension(0)) >> 2) >> 3;
-
const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
execute_window_loop(window, [&](const Coordinates & id)
@@ -710,10 +985,14 @@
The size of the output tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. All other dimensions must have the same size.
*/
- for(size_t k = num_it; k > 0; mtx_a0 += 16, mtx_b0 += 32, --k)
+ const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
+
+ for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
{
const float16x8_t p00 = vld1q_f16(mtx_a0);
const float16x8_t p02 = vld1q_f16(mtx_a0 + 8);
+
const float16x8_t q00 = vld1q_f16(mtx_b0);
const float16x8_t q02 = vld1q_f16(mtx_b0 + 8);
const float16x8_t q04 = vld1q_f16(mtx_b0 + 16);
@@ -738,6 +1017,24 @@
c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5)));
c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6)));
c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7)));
+
+ mtx_a0 += 16;
+ mtx_b0 += 32;
+ }
+
+ for(; mtx_b0 < mtx_b0_end_addr;)
+ {
+ const float16x4_t p00 = vld1_f16(mtx_a0);
+ const float16x8_t q00 = vld1q_f16(mtx_b0);
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3)));
+
+ mtx_a0 += 4;
+ mtx_b0 += 8;
}
if(multiply_alpha)
@@ -754,9 +1051,14 @@
vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
},
ina, inb, out);
-#else
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(input0);
+ ARM_COMPUTE_UNUSED(input1);
+ ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_ERROR("Not implemented");
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
}
template <bool multiply_alpha>
@@ -768,7 +1070,7 @@
const size_t out_stride3 = out_stride1 * 3;
const int num_elems_matrix_b_x = input1->info()->dimension(0);
const int fixed_point_position = input0->info()->fixed_point_position();
- const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+ const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
ARM_COMPUTE_UNUSED(alpha_qs8);
// Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
@@ -867,7 +1169,7 @@
asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif
+#endif /* __arm__ */
// Second accumulation
acc00_qs16 = vqmlal_qs8(acc00_qs16, b02, a4, fixed_point_position);
@@ -992,6 +1294,120 @@
ina, inb, out);
}
+template <bool multiply_alpha>
+void matrix_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+ const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
+ const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
+ const size_t out_stride2 = out_stride1 * 2;
+ const size_t out_stride3 = out_stride1 * 3;
+ const int num_elems_matrix_b_x = input1->info()->dimension(0);
+ const int fixed_point_position = input0->info()->fixed_point_position();
+ const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
+ ARM_COMPUTE_UNUSED(alpha_qs16);
+
+ // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has 4 times fewer rows than the output matrix
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ // Set step_x and step_y for matrix B. Scale the X range by a factor of 8 as the transposed input matrix B has 8 times fewer columns than the output matrix
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, window);
+
+ // The implementation assumes that matrix A and matrix B have been reshaped with NEGEMMInterleave4x4 and NEGEMMTranspose1xW respectively
+ // The reshaping makes the implementation cache friendly and avoids the data re-arrangements needed for computing 8x4 elements per iteration
+ // All the values needed for computing a single 8x4 block will be read from consecutive memory positions
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ auto mtx_a0 = reinterpret_cast<const qint16_t *>(ina.ptr());
+ auto mtx_b0 = reinterpret_cast<const qint16_t *>(inb.ptr());
+ auto mtx_b1 = mtx_b0 + in_b_stride;
+
+ qint32x4_t acc00_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc10_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc20_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc30_qs32 = vdupq_n_qs32(0);
+
+ qint32x4_t acc01_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc11_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc21_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc31_qs32 = vdupq_n_qs32(0);
+
+ // Each iteration of this loop performs a single accumulation step for the whole 8x4 output block
+ for(int k = 0; k <= (num_elems_matrix_b_x - 8); k += 8)
+ {
+ const qint16x4_t a0 = vld1_dup_qs16(mtx_a0 + 0);
+ const qint16x4_t a1 = vld1_dup_qs16(mtx_a0 + 1);
+ const qint16x4_t a2 = vld1_dup_qs16(mtx_a0 + 2);
+ const qint16x4_t a3 = vld1_dup_qs16(mtx_a0 + 3);
+
+ const qint16x4_t b00 = vld1_qs16(mtx_b0 + 0);
+ const qint16x4_t b01 = vld1_qs16(mtx_b0 + 4);
+
+ acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
+ acc10_qs32 = vqmlal_qs16(acc10_qs32, b00, a1, fixed_point_position);
+ acc20_qs32 = vqmlal_qs16(acc20_qs32, b00, a2, fixed_point_position);
+ acc30_qs32 = vqmlal_qs16(acc30_qs32, b00, a3, fixed_point_position);
+ acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
+ acc11_qs32 = vqmlal_qs16(acc11_qs32, b01, a1, fixed_point_position);
+ acc21_qs32 = vqmlal_qs16(acc21_qs32, b01, a2, fixed_point_position);
+ acc31_qs32 = vqmlal_qs16(acc31_qs32, b01, a3, fixed_point_position);
+
+ mtx_a0 += 4;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+ }
+
+ // Convert back to qint16x4_t and saturate
+ qint16x4_t acc00_qs16 = vqmovn_qs32(acc00_qs32);
+ qint16x4_t acc10_qs16 = vqmovn_qs32(acc10_qs32);
+ qint16x4_t acc20_qs16 = vqmovn_qs32(acc20_qs32);
+ qint16x4_t acc30_qs16 = vqmovn_qs32(acc30_qs32);
+
+ qint16x4_t acc01_qs16 = vqmovn_qs32(acc01_qs32);
+ qint16x4_t acc11_qs16 = vqmovn_qs32(acc11_qs32);
+ qint16x4_t acc21_qs16 = vqmovn_qs32(acc21_qs32);
+ qint16x4_t acc31_qs16 = vqmovn_qs32(acc31_qs32);
+
+ // Multiply by the weight of the matrix product (alpha)
+ if(multiply_alpha)
+ {
+ acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position);
+ acc10_qs16 = vqmul_qs16(acc10_qs16, alpha_qs16, fixed_point_position);
+ acc20_qs16 = vqmul_qs16(acc20_qs16, alpha_qs16, fixed_point_position);
+ acc30_qs16 = vqmul_qs16(acc30_qs16, alpha_qs16, fixed_point_position);
+ acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position);
+ acc11_qs16 = vqmul_qs16(acc11_qs16, alpha_qs16, fixed_point_position);
+ acc21_qs16 = vqmul_qs16(acc21_qs16, alpha_qs16, fixed_point_position);
+ acc31_qs16 = vqmul_qs16(acc31_qs16, alpha_qs16, fixed_point_position);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<qint16_t *>(out.ptr());
+
+ // Store 8x4 output elements
+ vst1_qs16(mtx_out0 + 0, acc00_qs16);
+ vst1_qs16(mtx_out0 + 4, acc01_qs16);
+ vst1_qs16(mtx_out0 + out_stride1 + 0, acc10_qs16);
+ vst1_qs16(mtx_out0 + out_stride1 + 4, acc11_qs16);
+ vst1_qs16(mtx_out0 + out_stride2 + 0, acc20_qs16);
+ vst1_qs16(mtx_out0 + out_stride2 + 4, acc21_qs16);
+ vst1_qs16(mtx_out0 + out_stride3 + 0, acc30_qs16);
+ vst1_qs16(mtx_out0 + out_stride3 + 4, acc31_qs16);
+ },
+ ina, inb, out);
+}
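
The qs16 kernel above relies on the library's saturating fixed-point intrinsics. As a reference, here is a minimal scalar sketch of the multiply-accumulate that vqmlal_qs16 performs on each lane; the exact rounding behaviour of the library intrinsic is an assumption, not taken from this patch.

#include <algorithm>
#include <cstdint>

// Scalar model of a QS16 multiply-accumulate: values are int16_t with an
// implicit scale of 2^-fixed_point_position, accumulated into int32_t.
int32_t qs16_mac(int32_t acc, int16_t a, int16_t b, int fixed_point_position)
{
    // Widen, multiply, then shift back so the product keeps the same scale.
    const int64_t prod = (static_cast<int64_t>(a) * static_cast<int64_t>(b)) >> fixed_point_position;
    const int64_t sum  = static_cast<int64_t>(acc) + prod;
    // Saturate to the 32-bit accumulator range, as the vq* intrinsics do.
    return static_cast<int32_t>(std::min<int64_t>(std::max<int64_t>(sum, INT32_MIN), INT32_MAX));
}
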
} // namespace
NEGEMMMatrixMultiplyKernel::NEGEMMMatrixMultiplyKernel()
@@ -1001,10 +1417,7 @@
void NEGEMMMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8, DataType::QS16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
@@ -1036,6 +1449,18 @@
num_elems_processed_per_iteration_x = 32;
break;
}
+ case DataType::QS16:
+ {
+ num_elems_processed_per_iteration_x = 16;
+ break;
+ }
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ {
+ num_elems_processed_per_iteration_x = 32;
+ break;
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
{
ARM_COMPUTE_ERROR("Data type not supported");
@@ -1049,7 +1474,7 @@
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x);
update_window_and_padding(win,
- AccessWindowHorizontal(input0->info(), 0, num_elems_processed_per_iteration_x),
+ AccessWindowStatic(input0->info(), 0, 0, input0->info()->tensor_shape().x(), 1),
AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
output_access);
@@ -1073,13 +1498,18 @@
num_elems_processed_per_iteration_x = 32;
break;
}
- case DataType::F16:
+ case DataType::QS16:
{
-#ifdef ARM_COMPUTE_ENABLE_FP16
num_elems_processed_per_iteration_x = 8;
break;
-#endif
}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ {
+ num_elems_processed_per_iteration_x = 8;
+ break;
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
{
ARM_COMPUTE_ERROR("Data type not supported");
@@ -1094,7 +1524,7 @@
update_window_and_padding(win,
AccessWindowRectangle(input0->info(), 0, 0, 4, 1, 1.f, 0.25f),
- AccessWindowTranspose(input1->info(), 0, 0, 4, 1, 0.f, 0.25f),
+ AccessWindowStatic(input1->info(), 0, 0, input1->info()->tensor_shape().x(), ceil_to_multiple(input1->info()->tensor_shape().y(), 4)),
output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
@@ -1103,30 +1533,44 @@
}
}
-void NEGEMMMatrixMultiplyKernel::run(const Window &window)
+void NEGEMMMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
bool multiply_alpha = std::abs(1.0f - _alpha) > 0.00001f;
- // Check if the output tensor is a vector and the data type is F32. If so,the kernel runs the vector-matrix multiplication
+ // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
if((_output->info()->dimension(1) == 1))
{
switch(_input0->info()->data_type())
{
case DataType::F32:
{
- multiply_alpha ? vector_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha) :
- vector_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
+ multiply_alpha ? vector_matrix_multiply_f32<true>(_input0, _input1, _output, window, info, _alpha) :
+ vector_matrix_multiply_f32<false>(_input0, _input1, _output, window, info, _alpha);
break;
}
case DataType::QS8:
{
- multiply_alpha ? vector_matrix_multiply_qs8<true>(_input0, _input1, _output, window, _alpha) :
- vector_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
+ multiply_alpha ? vector_matrix_multiply_qs8<true>(_input0, _input1, _output, window, info, _alpha) :
+ vector_matrix_multiply_qs8<false>(_input0, _input1, _output, window, info, _alpha);
break;
}
+ case DataType::QS16:
+ {
+ multiply_alpha ? vector_matrix_multiply_qs16<true>(_input0, _input1, _output, window, info, _alpha) :
+ vector_matrix_multiply_qs16<false>(_input0, _input1, _output, window, info, _alpha);
+ break;
+ }
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ {
+ multiply_alpha ? vector_matrix_multiply_f16<true>(_input0, _input1, _output, window, info, _alpha) :
+ vector_matrix_multiply_f16<false>(_input0, _input1, _output, window, info, _alpha);
+ break;
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
{
ARM_COMPUTE_ERROR("Data type not supported");
@@ -1150,14 +1594,20 @@
matrix_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
break;
}
+ case DataType::QS16:
+ {
+ multiply_alpha ? matrix_matrix_multiply_qs16<true>(_input0, _input1, _output, window, _alpha) :
+ matrix_matrix_multiply_qs16<false>(_input0, _input1, _output, window, _alpha);
+ break;
+ }
+#ifdef ARM_COMPUTE_ENABLE_FP16
case DataType::F16:
{
-#ifdef ARM_COMPUTE_ENABLE_FP16
multiply_alpha ? matrix_matrix_multiply_f16<true>(_input0, _input1, _output, window, _alpha) :
matrix_matrix_multiply_f16<false>(_input0, _input1, _output, window, _alpha);
break;
-#endif
}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
{
ARM_COMPUTE_ERROR("Data type not supported");
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index ccf5cb4..7f4ee1e 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -43,7 +43,8 @@
void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
TensorShape output_shape{ input->info()->tensor_shape() };
@@ -56,28 +57,33 @@
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
- const float scale_x = num_elems_processed_per_iteration;
+ const int scale_x = num_elems_processed_per_iteration;
_input = input;
_output = output;
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
+
AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
update_window_and_padding(win,
AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
INEKernel::configure(win);
}
-void NEGEMMTranspose1xWKernel::run(const Window &window)
+void NEGEMMTranspose1xWKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
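
The transpose kernel above derives its block width from the element size (16 bytes per iteration), so a QS16 or F16 input is processed 8 elements at a time and an F32 input 4 at a time. As a reference, a scalar model of the 1xW reshaping is sketched here; the layout is inferred from the kernel's documented pairing with NEGEMMInterleave4x4 and should be treated as an assumption rather than the library implementation.

#include <cstddef>
#include <vector>

// Scalar model of NEGEMMTranspose1xW: each W-wide chunk of an input row
// becomes W consecutive elements of one output row, with W = 16 / sizeof(T).
template <typename T>
std::vector<T> transpose_1xW(const std::vector<T> &in, size_t rows, size_t cols)
{
    const size_t W          = 16 / sizeof(T);
    const size_t num_blocks = (cols + W - 1) / W;    // output height
    std::vector<T> out(num_blocks * rows * W, T(0)); // zero-pad the ragged last block
    for(size_t r = 0; r < rows; ++r)
    {
        for(size_t c = 0; c < cols; ++c)
        {
            const size_t block  = c / W; // which output row
            const size_t offset = c % W; // position inside the 1xW chunk
            out[block * rows * W + r * W + offset] = in[r * cols + c];
        }
    }
    return out;
}
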
diff --git a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
index 419f482..048c229 100644
--- a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
@@ -64,8 +64,9 @@
INEKernel::configure(win);
}
-void NEGaussian3x3Kernel::run(const Window &window)
+void NEGaussian3x3Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
index f872cc2..b62e281 100644
--- a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
+++ b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
@@ -73,8 +73,9 @@
INEKernel::configure(win);
}
-void NEGaussian5x5HorKernel::run(const Window &window)
+void NEGaussian5x5HorKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
@@ -116,8 +117,8 @@
void NEGaussian5x5VertKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
{
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::S16);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
_input = input;
_output = output;
@@ -140,8 +141,9 @@
INEKernel::configure(win);
}
-void NEGaussian5x5VertKernel::run(const Window &window)
+void NEGaussian5x5VertKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
index 52d1fbf..d6cb1b6 100644
--- a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
+++ b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
@@ -110,8 +110,9 @@
INEKernel::configure(win);
}
-void NEGaussianPyramidHorKernel::run(const Window &window)
+void NEGaussianPyramidHorKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(window.x().step() % 2);
@@ -215,8 +216,9 @@
INEKernel::configure(win);
}
-void NEGaussianPyramidVertKernel::run(const Window &window)
+void NEGaussianPyramidVertKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(window.x().step() != 16);
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
index 404ad8a..3fd81be 100644
--- a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
@@ -675,8 +675,9 @@
INEKernel::configure(win);
}
-void NEHOGOrientationBinningKernel::run(const Window &window)
+void NEHOGOrientationBinningKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -768,8 +769,9 @@
INEKernel::configure(win);
}
-void NEHOGBlockNormalizationKernel::run(const Window &window)
+void NEHOGBlockNormalizationKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
index 4af22bc..343b051 100644
--- a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
@@ -92,8 +92,9 @@
INEKernel::configure(win);
}
-void NEHOGDetectorKernel::run(const Window &window)
+void NEHOGDetectorKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_hog_descriptor == nullptr);
@@ -176,7 +177,7 @@
win.idx_class = _idx_class;
win.score = score;
- std::unique_lock<std::mutex> lock(_mutex);
+ std::unique_lock<arm_compute::Mutex> lock(_mutex);
_detection_windows->push_back(win);
lock.unlock();
}
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index 585676b..233b2ba 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -287,8 +287,9 @@
}
template <int32_t block_size>
-void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window)
+void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -360,7 +361,7 @@
INEKernel::configure(win);
}
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
template class arm_compute::NEHarrisScoreKernel<3>;
template class arm_compute::NEHarrisScoreKernel<5>;
@@ -1029,8 +1030,9 @@
}
template <int32_t block_size>
-void NEHarrisScoreKernel<block_size>::run(const Window &window)
+void NEHarrisScoreKernel<block_size>::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEHistogramKernel.cpp b/src/core/NEON/kernels/NEHistogramKernel.cpp
index 9e967ec..6e402ae 100644
--- a/src/core/NEON/kernels/NEHistogramKernel.cpp
+++ b/src/core/NEON/kernels/NEHistogramKernel.cpp
@@ -44,7 +44,7 @@
inline void NEHistogramKernel::merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins)
{
- std::lock_guard<std::mutex> lock(_hist_mtx);
+ std::lock_guard<arm_compute::Mutex> lock(_hist_mtx);
const unsigned int v_end = (bins / 4) * 4;
@@ -66,7 +66,7 @@
{
}
-void NEHistogramKernel::histogram_U8(Window win)
+void NEHistogramKernel::histogram_U8(Window win, const ThreadInfo &info)
{
ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
@@ -74,7 +74,7 @@
const int32_t offset = _output->offset();
const uint32_t offrange = offset + _output->range();
const uint32_t *const w_lut = _window_lut;
- uint32_t *const local_hist = _local_hist + win.thread_id() * bins;
+ uint32_t *const local_hist = _local_hist + info.thread_id * bins;
// Clear local_histogram
std::fill_n(local_hist, bins, 0);
@@ -129,8 +129,9 @@
merge_histogram(_output->buffer(), local_hist, bins);
}
-void NEHistogramKernel::histogram_fixed_U8(Window win)
+void NEHistogramKernel::histogram_fixed_U8(Window win, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
std::array<uint32_t, _max_range_size> local_hist{ { 0 } };
@@ -242,11 +243,11 @@
INEKernel::configure(win);
}
-void NEHistogramKernel::run(const Window &window)
+void NEHistogramKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (this->*_func)(window);
+ (this->*_func)(window, info);
}
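
The histogram kernel now indexes its per-thread scratch area with info.thread_id and merges into the shared histogram under a mutex. A minimal sketch of that pattern follows; the binning rule and all names are illustrative, only the thread-local-then-merge structure mirrors the kernel.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <vector>

// Illustrative: accumulate into a thread-local histogram, then merge once
// into the shared histogram while holding the lock.
void histogram_thread(const uint8_t *data, size_t n, size_t bins,
                      std::vector<uint32_t> &local_hist,
                      std::vector<uint32_t> &global_hist, std::mutex &mtx)
{
    std::fill(local_hist.begin(), local_hist.end(), 0u);
    for(size_t i = 0; i < n; ++i)
    {
        const size_t bin = static_cast<size_t>(data[i]) * bins / 256; // simple uniform binning (assumption)
        ++local_hist[bin];
    }
    std::lock_guard<std::mutex> lock(mtx); // the kernel uses arm_compute::Mutex here
    for(size_t b = 0; b < bins; ++b)
    {
        global_hist[b] += local_hist[b];
    }
}
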
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index c7c23d5..71910e3 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -47,7 +48,8 @@
bool has_bias,
int top_left_x,
int top_left_y,
- int kernel_size,
+ int kernel_width,
+ int kernel_height,
int kernel_depth,
int input_w,
int input_h,
@@ -56,9 +58,9 @@
int input_stride_z,
int fixed_point_position)
{
- const int kernel_size2 = kernel_size * kernel_size;
- const int x_e = top_left_x + kernel_size;
- const int y_e = top_left_y + kernel_size;
+ const int kernel_size2 = kernel_width * kernel_height;
+ const int x_e = top_left_x + kernel_width;
+ const int y_e = top_left_y + kernel_height;
// Linearize volume
int d = 0;
@@ -109,8 +111,8 @@
if((y < 0 || y >= input_h) && has_pads)
{
// All the values will be zeros
- memset(out_ptr, 0, kernel_size * sizeof(T));
- out_ptr += kernel_size;
+ memset(out_ptr, 0, kernel_width * sizeof(T));
+ out_ptr += kernel_width;
}
else
{
@@ -132,9 +134,13 @@
// Append 1 if the convolution layer has biases
if(has_bias)
{
- if(std::is_same<T, arm_compute::qint8_t>::value)
+ if(std::is_same<T, qint8_t>::value)
{
- *out_ptr = scvt_qs8_f32(1.0f, fixed_point_position);
+ *out_ptr = sqcvt_qs8_f32(1.0f, fixed_point_position);
+ }
+ else if(std::is_same<T, qint16_t>::value)
+ {
+ *out_ptr = sqcvt_qs16_f32(1.0f, fixed_point_position);
}
else
{
@@ -199,7 +205,8 @@
_has_bias,
top_left_x,
top_left_y,
- static_cast<int>(_kernel_size),
+ static_cast<int>(_kernel_width),
+ static_cast<int>(_kernel_height),
kernel_depth,
input_w,
input_h,
@@ -224,7 +231,7 @@
in_window.set(Window::DimX, Window::Dimension(0, 1, 1));
Window out_window;
- out_window.use_tensor_dimensions(_output->info());
+ out_window.use_tensor_dimensions(_output->info()->tensor_shape());
out_window.set(Window::DimX, Window::Dimension(out_window.x().start(), out_window.x().end(), in_width));
Window in_slice = in_window.first_slice_window_3D();
@@ -246,9 +253,13 @@
// Add bias
if(_has_bias)
{
- if(std::is_same<T, arm_compute::qint8_t>::value)
+ if(std::is_same<T, qint8_t>::value)
{
- *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = scvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
+ }
+ else if(std::is_same<T, qint16_t>::value)
+ {
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs16_f32(1.0f, _input->info()->fixed_point_position());
}
else
{
@@ -260,24 +271,30 @@
}
NEIm2ColKernel::NEIm2ColKernel()
- : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_size(0), _has_bias(false)
+ : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false)
{
}
-void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias)
+void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QS8, DataType::QS16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
_input = input;
_output = output;
- _convolved_dims = convolved_dims;
_conv_info = conv_info;
- _kernel_size = std::sqrt((output->info()->dimension(0) - (has_bias ? 1 : 0)) / input->info()->dimension(2));
- _has_bias = has_bias;
+ _kernel_width = kernel_dims.width;
+ _kernel_height = kernel_dims.height;
+ _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
+ _kernel_width, _kernel_height,
+ _conv_info);
+ _has_bias = has_bias;
- unsigned int pad_x, pad_y, stride_x, stride_y = 0;
+ unsigned int pad_x = 0;
+ unsigned int pad_y = 0;
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
std::tie(pad_x, pad_y) = conv_info.pad();
std::tie(stride_x, stride_y) = conv_info.stride();
@@ -296,9 +313,17 @@
case DataType::F32:
_func = &NEIm2ColKernel::run_reduced<float>;
break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ _func = &NEIm2ColKernel::run_reduced<float16_t>;
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::QS8:
_func = &NEIm2ColKernel::run_reduced<qint8_t>;
break;
+ case DataType::QS16:
+ _func = &NEIm2ColKernel::run_reduced<qint16_t>;
+ break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
@@ -311,9 +336,17 @@
case DataType::F32:
_func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<float, false> : &NEIm2ColKernel::run_generic<float, true>;
break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<float16_t, false> : &NEIm2ColKernel::run_generic<float16_t, true>;
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::QS8:
_func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<qint8_t, false> : &NEIm2ColKernel::run_generic<qint8_t, true>;
break;
+ case DataType::QS16:
+ _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<qint16_t, false> : &NEIm2ColKernel::run_generic<qint16_t, true>;
+ break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
@@ -329,8 +362,9 @@
IKernel::configure(window);
}
-void NEIm2ColKernel::run(const Window &window)
+void NEIm2ColKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
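
NEIm2ColKernel::configure() now computes _convolved_dims from the kernel dimensions and the PadStrideInfo via scaled_dimensions(). The usual formula behind that helper is sketched below; the floor division and symmetric padding are assumptions made for illustration.

#include <utility>

// Output spatial size of a convolution, one value per dimension:
// out = (in + 2 * pad - kernel) / stride + 1, using integer (floor) division.
std::pair<unsigned int, unsigned int> conv_output_dims(unsigned int in_w, unsigned int in_h,
                                                       unsigned int kernel_w, unsigned int kernel_h,
                                                       unsigned int pad_x, unsigned int pad_y,
                                                       unsigned int stride_x, unsigned int stride_y)
{
    const unsigned int out_w = (in_w + 2 * pad_x - kernel_w) / stride_x + 1;
    const unsigned int out_h = (in_h + 2 * pad_y - kernel_h) / stride_y + 1;
    return std::make_pair(out_w, out_h);
}
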
diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.cpp b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
index 3b09a1b..16a3cf7 100644
--- a/src/core/NEON/kernels/NEIntegralImageKernel.cpp
+++ b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
@@ -71,8 +71,9 @@
return false;
}
-void NEIntegralImageKernel::run(const Window &window)
+void NEIntegralImageKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEL2NormalizeKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeKernel.cpp
new file mode 100644
index 0000000..12c532a
--- /dev/null
+++ b/src/core/NEON/kernels/NEL2NormalizeKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cmath>
+
+using namespace arm_compute;
+
+namespace
+{
+void l2_normalize_X(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
+{
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window in_slice = window.first_slice_window_1D();
+ Window sum_slice = window_sum.first_slice_window_1D();
+
+ do
+ {
+ Iterator input_it(in, in_slice);
+ Iterator sum_it(sum, sum_slice);
+ Iterator output_it(out, in_slice);
+
+ const float sum_value = *reinterpret_cast<const float *>(sum_it.ptr());
+ const float32x4_t vec_normalize_value = vdupq_n_f32(1.f / std::sqrt(std::max(sum_value, epsilon)));
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(input_it.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(output_it.ptr());
+
+ vst1q_f32(out_ptr, vmulq_f32(vld1q_f32(in_ptr), vec_normalize_value));
+ },
+ input_it, output_it);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
+} // namespace
+
+NEL2NormalizeKernel::NEL2NormalizeKernel()
+ : _input(nullptr), _sum(nullptr), _output(nullptr), _axis(0), _epsilon(1e-12)
+{
+}
+
+void NEL2NormalizeKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, unsigned int axis, float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
+ ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Normalization axis greater than max number of dimensions");
+ ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported normalization axis, supported axis is 0");
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
+ unsigned int num_elems_processed_per_iteration_sum = (axis == 0) ? 1 : num_elems_processed_per_iteration;
+
+ _input = input;
+ _sum = sum;
+ _output = output;
+ _axis = axis;
+ _epsilon = epsilon;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal sum_access(sum->info(), 0, num_elems_processed_per_iteration_sum);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, sum_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEL2NormalizeKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_axis)
+ {
+ case 0:
+ l2_normalize_X(_input, _sum, _output, _epsilon, window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported normalization axis");
+ }
+}
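
For reference, the arithmetic performed by l2_normalize_X above can be written as a plain scalar loop; sum_of_squares is assumed to be the reduction produced by a separate kernel, exactly as the sum tensor is consumed here.

#include <algorithm>
#include <cmath>
#include <cstddef>

// Scalar reference: out[i] = in[i] / sqrt(max(sum_of_squares, epsilon)).
void l2_normalize_row(const float *in, float sum_of_squares, float *out, size_t len, float epsilon)
{
    const float norm = 1.f / std::sqrt(std::max(sum_of_squares, epsilon));
    for(size_t i = 0; i < len; ++i)
    {
        out[i] = in[i] * norm;
    }
}
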
diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp
index 3d2bfb2..6fac797 100644
--- a/src/core/NEON/kernels/NELKTrackerKernel.cpp
+++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp
@@ -385,8 +385,9 @@
INEKernel::configure(window);
}
-void NELKTrackerKernel::run(const Window &window)
+void NELKTrackerKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
index ab84efb..1b2942c 100644
--- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -49,15 +49,136 @@
namespace
{
-void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window)
+void vector_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+
+ // The implementation computes 16 elements per iteration
+ const int window_start_x = 16 * info.thread_id;
+ const int window_step_x = 16 * info.num_threads;
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator ina(input0, win_a);
+ Iterator out(output, win_out);
+
+ execute_window_loop(win_out, [&](const Coordinates & id)
+ {
+ if(id.x() > width_matrix_b)
+ {
+ return;
+ }
+
+ float16x8_t acc0 = vdupq_n_f16(0.f);
+ float16x8_t acc1 = vdupq_n_f16(0.f);
+ float16x8_t acc2 = vdupq_n_f16(0.f);
+ float16x8_t acc3 = vdupq_n_f16(0.f);
+
+ auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float16_t *>(input1->ptr_to_element(Coordinates(id[0], 0, id[1])));
+
+ const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
+
+ for(; vec_a <= (vec_a_end_addr - 4);)
+ {
+ const float16x4_t a0l = vld1_f16(vec_a);
+
+ float16x8_t b00 = vld1q_f16(matrix_b);
+ float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+
+ float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+ float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+ float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+ float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));
+
+ matrix_b += 2 * in_b_stride;
+
+ b00 = vld1q_f16(matrix_b);
+ b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+ b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+ b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+ b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));
+
+ vec_a += 4;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const float16_t a0 = *vec_a;
+ const float16x8_t b00 = vld1q_f16(matrix_b);
+ const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
+ acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
+ acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
+ acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
+ }
+
+ const auto vec_out = reinterpret_cast<float16_t *>(out.ptr());
+
+ vst1q_f16(vec_out + 0, acc0);
+ vst1q_f16(vec_out + 8, acc1);
+ vst1q_f16(vec_out + 16, acc2);
+ vst1q_f16(vec_out + 24, acc3);
+ },
+ ina, out);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(input0);
+ ARM_COMPUTE_UNUSED(input1);
+ ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR("Not supported, recompile with -march=armv8.2-a+fp16+simd.");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
+void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info)
{
const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
// The implementation computes 16 elements per iteration
- const int window_start_x = 16 * window.thread_id();
- const int window_step_x = 16 * window.num_threads();
+ const int window_start_x = 16 * info.thread_id;
+ const int window_step_x = 16 * info.num_threads;
// Make sure (window_end_x - window_start_x) is a multiple of window_step_x
const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
@@ -89,7 +210,7 @@
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
-#endif
+#endif /* __arm__ */
const float *vec_a_end_addr = vec_a + num_elems_vec_a;
@@ -113,7 +234,7 @@
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
-#endif
+#endif /* __arm__ */
acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
@@ -190,17 +311,17 @@
void NELocallyConnectedMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
_input0 = input0;
_input1 = input1;
_output = output;
- unsigned int num_elems_processed_per_iteration_x = 16;
+ const unsigned int num_elems_processed_per_iteration_x = 16;
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
@@ -217,10 +338,27 @@
INEKernel::configure(win);
}
-void NELocallyConnectedMatrixMultiplyKernel::run(const Window &window)
+void NELocallyConnectedMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- vector_matrix_multiply_f32(_input0, _input1, _output, window);
+ switch(_input0->info()->data_type())
+ {
+ case DataType::F16:
+ {
+ vector_matrix_multiply_f16(_input0, _input1, _output, window, info);
+ break;
+ }
+ case DataType::F32:
+ {
+ vector_matrix_multiply_f32(_input0, _input1, _output, window, info);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
}
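
Both vector_matrix_multiply_* functions above stride the X dimension across threads: thread t starts at column 16 * t and advances by 16 * num_threads, so the threads tile the output row without overlapping. A scalar stand-in for that partitioning (the 16-wide NEON body is replaced by a placeholder loop) is sketched here.

#include <cstddef>

// Illustrative: thread t covers columns 16*t, 16*t + 16*T, 16*t + 32*T, ...
void process_columns(float *row, size_t width, int thread_id, int num_threads)
{
    const size_t start = 16u * static_cast<size_t>(thread_id);
    const size_t step  = 16u * static_cast<size_t>(num_threads);
    for(size_t x = start; x < width; x += step)
    {
        // Real kernel: compute 16 output elements at column x with NEON intrinsics.
        for(size_t i = x; i < x + 16 && i < width; ++i)
        {
            row[i] = 0.f; // placeholder for the dot-product result
        }
    }
}
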
diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
index a874d21..433985f 100644
--- a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
+++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
@@ -415,8 +415,9 @@
}
template <MagnitudeType mag_type, PhaseType phase_type>
-void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::run(const Window &window)
+void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -428,7 +429,7 @@
template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
namespace
{
@@ -854,8 +855,9 @@
}
template <MagnitudeType mag_type, PhaseType phase_type>
-void NEMagnitudePhaseKernel<mag_type, phase_type>::run(const Window &window)
+void NEMagnitudePhaseKernel<mag_type, phase_type>::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
index 4616203..7895b00 100644
--- a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
@@ -85,10 +85,15 @@
} // namespace
NEMeanStdDevKernel::NEMeanStdDevKernel()
- : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _mtx()
+ : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _mtx(), _border_size(0)
{
}
+BorderSize NEMeanStdDevKernel::border_size() const
+{
+ return _border_size;
+}
+
void NEMeanStdDevKernel::configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev, uint64_t *global_sum_squared)
{
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
@@ -105,6 +110,8 @@
constexpr unsigned int num_elems_processed_per_iteration = 16;
+ _border_size = BorderSize(ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration) - input->info()->dimension(0));
+
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
@@ -113,8 +120,9 @@
INEKernel::configure(win);
}
-void NEMeanStdDevKernel::run(const Window &window)
+void NEMeanStdDevKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Iterator input(_input, window);
@@ -134,7 +142,7 @@
const float num_pixels = _input->info()->dimension(0) * _input->info()->dimension(1);
// Merge sum and calculate mean and stddev
- std::unique_lock<std::mutex> lock(_mtx);
+ std::unique_lock<arm_compute::Mutex> lock(_mtx);
*_global_sum += vget_lane_u64(local_sum, 0);
diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
index 601a0e1..54ef33e 100644
--- a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
@@ -75,8 +75,9 @@
INEKernel::configure(win);
}
-void NEMedian3x3Kernel::run(const Window &window)
+void NEMedian3x3Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
new file mode 100644
index 0000000..a81725f
--- /dev/null
+++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <climits>
+#include <cstddef>
+
+namespace arm_compute
+{
+NEMinMaxLayerKernel::NEMinMaxLayerKernel()
+ : _input(nullptr), _output(nullptr), _mtx()
+{
+}
+
+void NEMinMaxLayerKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(Window::DimX, 2);
+ output_shape.remove_dimension(1);
+ output_shape.remove_dimension(1);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, 2);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEMinMaxLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const int x_start = window.x().start();
+ const int x_end = window.x().end();
+
+ Window window_output;
+ window_output.use_tensor_dimensions(_output->info()->tensor_shape());
+ window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Handle X dimension manually to split into two loops
+ // The first one uses vector operations, the second one processes the leftover pixels
+ Window window_input(window);
+ window_input.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_input.collapse_if_possible(INEKernel::window(), 3);
+ window_input.set(3, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, window_input);
+ Iterator output(_output, window_output);
+
+ execute_window_loop(window_output, [&](const Coordinates & id_batch)
+ {
+ float32x2_t carry_min = vdup_n_f32(std::numeric_limits<float>::max());
+ float32x2_t carry_max = vdup_n_f32(std::numeric_limits<float>::lowest());
+
+ float carry_min_scalar = std::numeric_limits<float>::max();
+ float carry_max_scalar = std::numeric_limits<float>::lowest();
+
+ execute_window_loop(window_input, [&](const Coordinates & id)
+ {
+ int x = x_start;
+ const auto in_ptr = reinterpret_cast<const float *const>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);
+
+ // Vector loop
+ for(; x <= x_end - 8; x += 8)
+ {
+ const float32x4x2_t pixels = vld2q_f32(in_ptr + x);
+ const float32x4_t tmp_min1 = vminq_f32(pixels.val[0], pixels.val[1]);
+ const float32x4_t tmp_max1 = vmaxq_f32(pixels.val[0], pixels.val[1]);
+ const float32x2_t tmp_min2 = vmin_f32(vget_high_f32(tmp_min1), vget_low_f32(tmp_min1));
+ const float32x2_t tmp_max2 = vmax_f32(vget_high_f32(tmp_max1), vget_low_f32(tmp_max1));
+ carry_min = vmin_f32(tmp_min2, carry_min);
+ carry_max = vmax_f32(tmp_max2, carry_max);
+ }
+
+ // Process leftover pixels
+ for(; x < x_end; ++x)
+ {
+ const float pixel = in_ptr[x];
+ carry_min_scalar = std::min(pixel, carry_min_scalar);
+ carry_max_scalar = std::max(pixel, carry_max_scalar);
+ }
+ },
+ input);
+
+ // Reduce result
+ carry_min = vpmin_f32(carry_min, carry_min);
+ carry_max = vpmax_f32(carry_max, carry_max);
+ carry_min = vpmin_f32(carry_min, carry_min);
+ carry_max = vpmax_f32(carry_max, carry_max);
+
+ // Extract max/min values
+ const float min_i = std::min(vget_lane_f32(carry_min, 0), carry_min_scalar);
+ const float max_i = std::max(vget_lane_f32(carry_max, 0), carry_max_scalar);
+
+ auto out_ptr = reinterpret_cast<float *const>(output.ptr());
+
+ // Perform reduction of local min/max values
+ update_min_max(out_ptr, min_i, max_i);
+ },
+ output);
+}
+
+void NEMinMaxLayerKernel::reset()
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+
+ float32x2_t reset_values = vdup_n_f32(0.0f);
+ reset_values = vset_lane_f32(std::numeric_limits<float>::max(), reset_values, 0);
+ reset_values = vset_lane_f32(std::numeric_limits<float>::min(), reset_values, 1);
+
+ Window window_output;
+ window_output.use_tensor_dimensions(_output->info()->tensor_shape());
+ window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator output(_output, window_output);
+
+ execute_window_loop(window_output, [&](const Coordinates & id)
+ {
+ vst1_f32(reinterpret_cast<float *const>(output.ptr()), reset_values);
+ },
+ output);
+}
+
+void NEMinMaxLayerKernel::update_min_max(float *out_ptr, float min, float max)
+{
+ std::lock_guard<Mutex> lock(_mtx);
+
+ const float32x2_t old_min = vld1_dup_f32(out_ptr);
+ const float32x2_t old_max = vld1_dup_f32(out_ptr + 1);
+ const float32x2_t new_min = vmin_f32(vdup_n_f32(min), old_min);
+ const float32x2_t new_max = vmax_f32(vdup_n_f32(max), old_max);
+
+ vst1_f32(out_ptr, vzip_f32(new_min, new_max).val[0]);
+}
+} // namespace arm_compute
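
update_min_max above keeps the running (min, max) pair interleaved in the output tensor, minimum at offset 0 and maximum at offset 1, and merges each thread's local result under the kernel's mutex. A scalar sketch of that merge:

#include <algorithm>
#include <mutex>

// Illustrative: out[0] holds the running minimum, out[1] the running maximum.
void merge_min_max(float *out, float local_min, float local_max, std::mutex &mtx)
{
    std::lock_guard<std::mutex> lock(mtx); // the kernel uses arm_compute::Mutex
    out[0] = std::min(out[0], local_min);
    out[1] = std::max(out[1], local_max);
}
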
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
index b188614..c7dc03c 100644
--- a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
@@ -31,7 +31,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include <algorithm>
#include <arm_neon.h>
#include <climits>
#include <cstddef>
@@ -39,14 +41,14 @@
namespace arm_compute
{
NEMinMaxKernel::NEMinMaxKernel()
- : _func(), _input(nullptr), _min(), _max(), _min_init(), _max_init(), _mtx()
+ : _func(), _input(nullptr), _min(), _max(), _mtx()
{
}
-void NEMinMaxKernel::configure(const IImage *input, int32_t *min, int32_t *max)
+void NEMinMaxKernel::configure(const IImage *input, void *min, void *max)
{
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
ARM_COMPUTE_ERROR_ON(nullptr == min);
ARM_COMPUTE_ERROR_ON(nullptr == max);
@@ -54,35 +56,33 @@
_min = min;
_max = max;
- switch(input->info()->format())
+ switch(_input->info()->data_type())
{
- case Format::U8:
- _min_init = UCHAR_MAX;
- _max_init = 0;
- _func = &NEMinMaxKernel::minmax_U8;
+ case DataType::U8:
+ _func = &NEMinMaxKernel::minmax_U8;
break;
- case Format::S16:
- _min_init = SHRT_MAX;
- _max_init = SHRT_MIN;
- _func = &NEMinMaxKernel::minmax_S16;
+ case DataType::S16:
+ _func = &NEMinMaxKernel::minmax_S16;
+ break;
+ case DataType::F32:
+ _func = &NEMinMaxKernel::minmax_F32;
break;
default:
- ARM_COMPUTE_ERROR("You called with the wrong img formats");
+ ARM_COMPUTE_ERROR("Unsupported data type");
break;
}
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
- update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
INEKernel::configure(win);
}
-void NEMinMaxKernel::run(const Window &window)
+void NEMinMaxKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -93,40 +93,85 @@
void NEMinMaxKernel::reset()
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- *_min = _min_init;
- *_max = _max_init;
+ switch(_input->info()->data_type())
+ {
+ case DataType::U8:
+ *static_cast<int32_t *>(_min) = UCHAR_MAX;
+ *static_cast<int32_t *>(_max) = 0;
+ break;
+ case DataType::S16:
+ *static_cast<int32_t *>(_min) = SHRT_MAX;
+ *static_cast<int32_t *>(_max) = SHRT_MIN;
+ break;
+ case DataType::F32:
+ *static_cast<float *>(_min) = std::numeric_limits<float>::max();
+ *static_cast<float *>(_max) = std::numeric_limits<float>::lowest();
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ break;
+ }
}
template <typename T>
void NEMinMaxKernel::update_min_max(const T min, const T max)
{
- std::lock_guard<std::mutex> lock(_mtx);
+ std::lock_guard<arm_compute::Mutex> lock(_mtx);
- if(min < *_min)
+ using type = typename std::conditional<std::is_same<T, float>::value, float, int32_t>::type;
+
+ auto min_ptr = static_cast<type *>(_min);
+ auto max_ptr = static_cast<type *>(_max);
+
+ if(min < *min_ptr)
{
- *_min = min;
+ *min_ptr = min;
}
- if(max > *_max)
+ if(max > *max_ptr)
{
- *_max = max;
+ *max_ptr = max;
}
}
-void NEMinMaxKernel::minmax_U8(const Window &win)
+void NEMinMaxKernel::minmax_U8(Window win)
{
uint8x8_t carry_min = vdup_n_u8(UCHAR_MAX);
uint8x8_t carry_max = vdup_n_u8(0);
+ uint8_t carry_max_scalar = 0;
+ uint8_t carry_min_scalar = UCHAR_MAX;
+
+ const int x_start = win.x().start();
+ const int x_end = win.x().end();
+
+ // Handle X dimension manually to split into two loops
+ // The first one uses vector operations, the second one processes the leftover pixels
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
Iterator input(_input, win);
execute_window_loop(win, [&](const Coordinates & id)
{
- const uint8x16_t pixels = vld1q_u8(input.ptr());
- const uint8x8_t tmp_min = vmin_u8(vget_high_u8(pixels), vget_low_u8(pixels));
- const uint8x8_t tmp_max = vmax_u8(vget_high_u8(pixels), vget_low_u8(pixels));
- carry_min = vmin_u8(tmp_min, carry_min);
- carry_max = vmax_u8(tmp_max, carry_max);
+ int x = x_start;
+
+ // Vector loop
+ for(; x <= x_end - 16; x += 16)
+ {
+ const uint8x16_t pixels = vld1q_u8(input.ptr() + x);
+ const uint8x8_t tmp_min = vmin_u8(vget_high_u8(pixels), vget_low_u8(pixels));
+ const uint8x8_t tmp_max = vmax_u8(vget_high_u8(pixels), vget_low_u8(pixels));
+ carry_min = vmin_u8(tmp_min, carry_min);
+ carry_max = vmax_u8(tmp_max, carry_max);
+ }
+
+ // Process leftover pixels
+ for(; x < x_end; ++x)
+ {
+ const uint8_t pixel = input.ptr()[x];
+ carry_min_scalar = std::min(pixel, carry_min_scalar);
+ carry_max_scalar = std::max(pixel, carry_max_scalar);
+ }
},
input);
@@ -139,30 +184,55 @@
carry_max = vpmax_u8(carry_max, carry_max);
// Extract max/min values
- const uint8_t min_i = vget_lane_u8(carry_min, 0);
- const uint8_t max_i = vget_lane_u8(carry_max, 0);
+ const uint8_t min_i = std::min(vget_lane_u8(carry_min, 0), carry_min_scalar);
+ const uint8_t max_i = std::max(vget_lane_u8(carry_max, 0), carry_max_scalar);
// Perform reduction of local min/max values
update_min_max(min_i, max_i);
}
-void NEMinMaxKernel::minmax_S16(const Window &win)
+void NEMinMaxKernel::minmax_S16(Window win)
{
int16x4_t carry_min = vdup_n_s16(SHRT_MAX);
int16x4_t carry_max = vdup_n_s16(SHRT_MIN);
+ int16_t carry_max_scalar = SHRT_MIN;
+ int16_t carry_min_scalar = SHRT_MAX;
+
+ const int x_start = win.x().start();
+ const int x_end = win.x().end();
+
+ // Handle X dimension manually to split into two loops
+    // The first loop uses vector operations, the second processes the leftover pixels
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
Iterator input(_input, win);
execute_window_loop(win, [&](const Coordinates & id)
{
- const auto in_ptr = reinterpret_cast<const int16_t *>(input.ptr());
- const int16x8x2_t pixels = vld2q_s16(in_ptr);
- const int16x8_t tmp_min1 = vminq_s16(pixels.val[0], pixels.val[1]);
- const int16x8_t tmp_max1 = vmaxq_s16(pixels.val[0], pixels.val[1]);
- const int16x4_t tmp_min2 = vmin_s16(vget_high_s16(tmp_min1), vget_low_s16(tmp_min1));
- const int16x4_t tmp_max2 = vmax_s16(vget_high_s16(tmp_max1), vget_low_s16(tmp_max1));
- carry_min = vmin_s16(tmp_min2, carry_min);
- carry_max = vmax_s16(tmp_max2, carry_max);
+ int x = x_start;
+ const auto in_ptr = reinterpret_cast<const int16_t *const>(input.ptr());
+
+ // Vector loop
+ for(; x <= x_end - 16; x += 16)
+ {
+ const int16x8x2_t pixels = vld2q_s16(in_ptr + x);
+ const int16x8_t tmp_min1 = vminq_s16(pixels.val[0], pixels.val[1]);
+ const int16x8_t tmp_max1 = vmaxq_s16(pixels.val[0], pixels.val[1]);
+ const int16x4_t tmp_min2 = vmin_s16(vget_high_s16(tmp_min1), vget_low_s16(tmp_min1));
+ const int16x4_t tmp_max2 = vmax_s16(vget_high_s16(tmp_max1), vget_low_s16(tmp_max1));
+ carry_min = vmin_s16(tmp_min2, carry_min);
+ carry_max = vmax_s16(tmp_max2, carry_max);
+ }
+
+ // Process leftover pixels
+ for(; x < x_end; ++x)
+ {
+ const int16_t pixel = in_ptr[x];
+ carry_min_scalar = std::min(pixel, carry_min_scalar);
+ carry_max_scalar = std::max(pixel, carry_max_scalar);
+ }
+
},
input);
@@ -173,15 +243,74 @@
carry_max = vpmax_s16(carry_max, carry_max);
// Extract max/min values
- const int16_t min_i = vget_lane_s16(carry_min, 0);
- const int16_t max_i = vget_lane_s16(carry_max, 0);
+ const int16_t min_i = std::min(vget_lane_s16(carry_min, 0), carry_min_scalar);
+ const int16_t max_i = std::max(vget_lane_s16(carry_max, 0), carry_max_scalar);
+
+ // Perform reduction of local min/max values
+ update_min_max(min_i, max_i);
+}
+
+void NEMinMaxKernel::minmax_F32(Window win)
+{
+ float32x2_t carry_min = vdup_n_f32(std::numeric_limits<float>::max());
+ float32x2_t carry_max = vdup_n_f32(std::numeric_limits<float>::lowest());
+
+ float carry_min_scalar = std::numeric_limits<float>::max();
+ float carry_max_scalar = std::numeric_limits<float>::lowest();
+
+ const int x_start = win.x().start();
+ const int x_end = win.x().end();
+
+ // Handle X dimension manually to split into two loops
+    // The first loop uses vector operations, the second processes the leftover pixels
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int x = x_start;
+ const auto in_ptr = reinterpret_cast<const float *const>(input.ptr());
+
+ // Vector loop
+ for(; x <= x_end - 8; x += 8)
+ {
+ const float32x4x2_t pixels = vld2q_f32(in_ptr + x);
+ const float32x4_t tmp_min1 = vminq_f32(pixels.val[0], pixels.val[1]);
+ const float32x4_t tmp_max1 = vmaxq_f32(pixels.val[0], pixels.val[1]);
+ const float32x2_t tmp_min2 = vmin_f32(vget_high_f32(tmp_min1), vget_low_f32(tmp_min1));
+ const float32x2_t tmp_max2 = vmax_f32(vget_high_f32(tmp_max1), vget_low_f32(tmp_max1));
+ carry_min = vmin_f32(tmp_min2, carry_min);
+ carry_max = vmax_f32(tmp_max2, carry_max);
+ }
+
+ // Process leftover pixels
+ for(; x < x_end; ++x)
+ {
+ const float pixel = in_ptr[x];
+ carry_min_scalar = std::min(pixel, carry_min_scalar);
+ carry_max_scalar = std::max(pixel, carry_max_scalar);
+ }
+
+ },
+ input);
+
+ // Reduce result
+ carry_min = vpmin_f32(carry_min, carry_min);
+ carry_max = vpmax_f32(carry_max, carry_max);
+ carry_min = vpmin_f32(carry_min, carry_min);
+ carry_max = vpmax_f32(carry_max, carry_max);
+
+ // Extract max/min values
+ const float min_i = std::min(vget_lane_f32(carry_min, 0), carry_min_scalar);
+ const float max_i = std::max(vget_lane_f32(carry_max, 0), carry_max_scalar);
// Perform reduction of local min/max values
update_min_max(min_i, max_i);
}
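
The three minmax_* variants above share one structure: the window's X dimension is collapsed to a single step and the row is walked manually, with a vectorised body consuming full 16- (or 8-) element chunks and a scalar tail handling the leftover elements; the two partial results are merged before the locked update. A simplified scalar model of that split, assuming standard C++ only (the chunked inner loop stands in for the NEON body):

#include <algorithm>
#include <cstdint>
#include <utility>

// Illustrative only: same loop structure as minmax_U8, without intrinsics.
std::pair<uint8_t, uint8_t> min_max_row(const uint8_t *row, int x_start, int x_end)
{
    constexpr int step = 16;              // elements per "vector" iteration
    uint8_t carry_min = UINT8_MAX;        // vector-loop carries (modelled as scalars here)
    uint8_t carry_max = 0;
    uint8_t carry_min_scalar = UINT8_MAX; // tail-loop carries
    uint8_t carry_max_scalar = 0;

    int x = x_start;
    for(; x <= x_end - step; x += step)   // full chunks
    {
        for(int i = 0; i < step; ++i)
        {
            carry_min = std::min(carry_min, row[x + i]);
            carry_max = std::max(carry_max, row[x + i]);
        }
    }
    for(; x < x_end; ++x)                 // leftover elements
    {
        carry_min_scalar = std::min(carry_min_scalar, row[x]);
        carry_max_scalar = std::max(carry_max_scalar, row[x]);
    }
    return { std::min(carry_min, carry_min_scalar), std::max(carry_max, carry_max_scalar) };
}
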
NEMinMaxLocationKernel::NEMinMaxLocationKernel()
- : _func(nullptr), _input(nullptr), _min(nullptr), _max(nullptr), _min_count(nullptr), _max_count(nullptr), _min_loc(nullptr), _max_loc(nullptr), _num_elems_processed_per_iteration(0)
+ : _func(nullptr), _input(nullptr), _min(nullptr), _max(nullptr), _min_count(nullptr), _max_count(nullptr), _min_loc(nullptr), _max_loc(nullptr)
{
}
@@ -222,12 +351,12 @@
&NEMinMaxLocationKernel::minmax_loc<T, bool(N & 8), bool(N & 4), bool(N & 2), bool(N & 1)>...
};
-void NEMinMaxLocationKernel::configure(const IImage *input, int32_t *min, int32_t *max,
+void NEMinMaxLocationKernel::configure(const IImage *input, void *min, void *max,
ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc,
uint32_t *min_count, uint32_t *max_count)
{
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8, Format::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
ARM_COMPUTE_ERROR_ON(nullptr == min);
ARM_COMPUTE_ERROR_ON(nullptr == max);
@@ -246,31 +375,35 @@
unsigned int table_idx = (count_min << 3) | (count_max << 2) | (loc_min << 1) | loc_max;
- switch(input->info()->format())
+ switch(input->info()->data_type())
{
- case Format::U8:
+ case DataType::U8:
_func = create_func_table<uint8_t, gen_index_seq<16>::type>::func_table[table_idx];
break;
- case Format::S16:
+ case DataType::S16:
_func = create_func_table<int16_t, gen_index_seq<16>::type>::func_table[table_idx];
break;
+ case DataType::F32:
+ _func = create_func_table<float, gen_index_seq<16>::type>::func_table[table_idx];
+ break;
default:
- ARM_COMPUTE_ERROR("You called with the wrong img formats");
+ ARM_COMPUTE_ERROR("Unsupported data type");
break;
}
- _num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, _num_elems_processed_per_iteration));
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
INEKernel::configure(win);
}
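
NEMinMaxLocationKernel::configure packs its four boolean options into a 4-bit table_idx and uses it to index a table of sixteen template instantiations generated from an index sequence. A reduced standalone sketch of that dispatch mechanism, using two flags instead of four (worker and make_table are illustrative names):

#include <array>
#include <cstdio>
#include <utility>

// Illustrative only: one instantiation per flag combination, selected by a packed index.
template <bool count, bool loc>
void worker()
{
    std::printf("count=%d loc=%d\n", count, loc);
}

template <std::size_t... N>
constexpr std::array<void (*)(), sizeof...(N)> make_table(std::index_sequence<N...>)
{
    return { { &worker<bool(N & 2), bool(N & 1)>... } };
}

int main()
{
    constexpr auto table = make_table(std::make_index_sequence<4>{});
    const bool count = true;
    const bool loc   = false;
    const unsigned idx = (count << 1) | loc; // same packing idea as table_idx in the kernel
    table[idx]();                            // dispatches to worker<true, false>
}
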
-void NEMinMaxLocationKernel::run(const Window &window)
+void NEMinMaxLocationKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -285,9 +418,8 @@
{
Iterator input(_input, win);
- size_t min_count = 0;
- size_t max_count = 0;
- unsigned int step = _num_elems_processed_per_iteration;
+ size_t min_count = 0;
+ size_t max_count = 0;
// Clear min location array
if(loc_min)
@@ -301,46 +433,48 @@
_max_loc->clear();
}
+ using type = typename std::conditional<std::is_same<T, float>::value, float, int32_t>::type;
+
+ auto min_ptr = static_cast<type *>(_min);
+ auto max_ptr = static_cast<type *>(_max);
+
execute_window_loop(win, [&](const Coordinates & id)
{
auto in_ptr = reinterpret_cast<const T *>(input.ptr());
int32_t idx = id.x();
int32_t idy = id.y();
- for(unsigned int i = 0; i < step; ++i)
+ const T pixel = *in_ptr;
+ Coordinates2D p{ idx, idy };
+
+ if(count_min || loc_min)
{
- const T pixel = *in_ptr++;
- Coordinates2D p{ idx++, idy };
-
- if(count_min || loc_min)
+ if(*min_ptr == pixel)
{
- if(*_min == pixel)
+ if(count_min)
{
- if(count_min)
- {
- ++min_count;
- }
+ ++min_count;
+ }
- if(loc_min)
- {
- _min_loc->push_back(p);
- }
+ if(loc_min)
+ {
+ _min_loc->push_back(p);
}
}
+ }
- if(count_max || loc_max)
+ if(count_max || loc_max)
+ {
+ if(*max_ptr == pixel)
{
- if(*_max == pixel)
+ if(count_max)
{
- if(count_max)
- {
- ++max_count;
- }
+ ++max_count;
+ }
- if(loc_max)
- {
- _max_loc->push_back(p);
- }
+ if(loc_max)
+ {
+ _max_loc->push_back(p);
}
}
}
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
index 03d1409..ba68de6 100644
--- a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
@@ -930,8 +930,9 @@
input, output);
}
-void NENonLinearFilterKernel::run(const Window &window)
+void NENonLinearFilterKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
index 1826c47..b7dfb59 100644
--- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
@@ -224,7 +224,7 @@
INEKernel::configure(win);
}
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
namespace
{
@@ -495,8 +495,9 @@
INEKernel::configure(win);
}
-void NENonMaximaSuppression3x3Kernel::run(const Window &window)
+void NENonMaximaSuppression3x3Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index a971dc8..fc3f5f2 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -46,15 +46,20 @@
void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, input_squared);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, input_squared, output);
ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
- ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
- ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
- ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared, output);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
+ }
const unsigned int border_width = (norm_info.type() == NormType::CROSS_MAP) ? 0 : std::min(norm_info.norm_size() / 2, 3U);
@@ -64,27 +69,101 @@
_norm_info = norm_info;
_border_size = BorderSize(0, border_width);
- const bool is_dt_f32 = _input->info()->data_type() == DataType::F32;
+ unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ ARM_COMPUTE_UNUSED(num_elems_processed_per_iteration);
- switch(norm_info.type())
+ switch(_input->info()->data_type())
{
- case NormType::IN_MAP_1D:
- _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, false> : &NENormalizationLayerKernel::normalize_fixed_point<0, false>;
+ case DataType::F32:
+ {
+ num_elems_processed_per_iteration = 4;
+ switch(norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 2, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
break;
- case NormType::IN_MAP_2D:
- // Normalize over X and Y
- _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, true> : &NENormalizationLayerKernel::normalize_fixed_point<0, true>;
+ }
+ case DataType::F16:
+ {
+ num_elems_processed_per_iteration = 8;
+ switch(norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 2, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
break;
- case NormType::CROSS_MAP:
- _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<2, false> : &NENormalizationLayerKernel::normalize_fixed_point<2, false>;
+ }
+ case DataType::QS8:
+ {
+ num_elems_processed_per_iteration = 16;
+ switch(norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 2, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
break;
+ }
+ case DataType::QS16:
+ {
+ num_elems_processed_per_iteration = 8;
+ switch(norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 2, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
default:
ARM_COMPUTE_ERROR("NOT SUPPORTED!");
}
- const unsigned int num_elems_processed_per_iteration = (is_dt_f32) ? 4 : 16;
- const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
- const unsigned int num_rows = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
+ const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+ const unsigned int num_rows = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
// Configure window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
@@ -100,8 +179,8 @@
INEKernel::configure(win);
}
-template <unsigned int dim, bool do_2D_norm>
-void NENormalizationLayerKernel::normalize(const Window &window)
+template <DataType dt, unsigned int dim, bool do_2D_norm>
+void NENormalizationLayerKernel::normalize_float(const Window &window)
{
Iterator input(_input, window);
Iterator input_squared(_input_squared, window);
@@ -117,42 +196,86 @@
const int min_top = 0;
const int max_bottom = _input->info()->dimension(dim_y) - 1;
- const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
- const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta());
- const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
-
- execute_window_loop(window, [&](const Coordinates & id)
+ if(dt == DataType::F32)
{
- // Get range to normalize
- const int current_row = do_2D_norm ? id[dim_y] : 0;
- const int current_slice = id[dim];
- const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
- const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
- const int first_slice = std::max(current_slice - radius, min_left);
- const int last_slice = std::min(current_slice + radius, max_right);
+ const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
+ const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta());
+ const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
- // Accumulate 2D In-Map values
- float32x4_t accu = vdupq_n_f32(0.f);
- for(int j = first_row; j <= last_row; j++)
+ execute_window_loop(window, [&](const Coordinates & id)
{
- // Compute row displacement
- const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
- const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
- for(int i = first_slice; i <= last_slice; ++i)
- {
- accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(input_squared_ptr + i * input_squared_stride)));
- }
- }
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
- // Normalize
- const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
- const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
- vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
- },
- input, input_squared, output);
+ // Accumulate 2D In-Map values
+ float32x4_t accu = vdupq_n_f32(0.f);
+ for(int j = first_row; j <= last_row; j++)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ // Normalize
+ const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
+ const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+ }
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ else if(dt == DataType::F16)
+ {
+ const float16x8_t coeff_vec = vdupq_n_f16(_norm_info.scale_coeff());
+ const float16x8_t beta_vec_f16 = vdupq_n_f16(_norm_info.beta());
+ const float16x8_t kappa_vec = vdupq_n_f16(_norm_info.kappa());
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ float16x8_t accu = vdupq_n_f16(0.f);
+ for(int j = first_row; j <= last_row; j++)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vaddq_f16(accu, vld1q_f16(reinterpret_cast<const float16_t *>(input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ const float16x8_t norm_f16 = vpowq_f16(vaddq_f16(kappa_vec, vmulq_f16(coeff_vec, accu)), beta_vec_f16);
+ const float16x8_t normalized_pixel = vmulq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), vinvq_f16(norm_f16));
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
-template <unsigned int dim, bool do_2D_norm>
+template <DataType dt, unsigned int dim, bool do_2D_norm>
void NENormalizationLayerKernel::normalize_fixed_point(const Window &window)
{
Iterator input(_input, window);
@@ -171,44 +294,89 @@
const int fixed_point_position = _input->info()->fixed_point_position();
- const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position);
- const qint8x16_t beta_vec = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position);
- const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position);
-
- execute_window_loop(window, [&](const Coordinates & id)
+ if(dt == DataType::QS8)
{
- // Get range to normalize
- const int current_row = do_2D_norm ? id[dim_y] : 0;
- const int current_slice = id[dim];
- const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
- const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
- const int first_slice = std::max(current_slice - radius, min_left);
- const int last_slice = std::min(current_slice + radius, max_right);
+ const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position);
+ const qint8x16_t beta_vec = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position);
+ const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position);
- // Accumulate 2D In-Map values
- qint8x16_t accu = vdupq_n_qs8(0);
- for(int j = first_row; j <= last_row; ++j)
+ execute_window_loop(window, [&](const Coordinates & id)
{
- // Compute row displacement
- const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
- const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
- for(int i = first_slice; i <= last_slice; ++i)
- {
- accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast<const qint8_t *>(input_squared_ptr + i * input_squared_stride)));
- }
- }
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
- // Normalize
- const qint8x16_t accu_scale = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position);
- const qint8x16_t normalized = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position);
- const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), normalized, fixed_point_position);
- vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), normalized_pixel);
- },
- input, input_squared, output);
+ // Accumulate 2D In-Map values
+ qint8x16_t accu = vdupq_n_qs8(0);
+ for(int j = first_row; j <= last_row; ++j)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast<const qint8_t *>(input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ // Normalize
+ const qint8x16_t accu_scale = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position);
+ const qint8x16_t normalized = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position);
+ const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), normalized, fixed_point_position);
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+ }
+ else if(dt == DataType::QS16)
+ {
+ const qint16x8_t coeff_vec = vdupq_n_qs16_f32(_norm_info.scale_coeff(), fixed_point_position);
+ const qint16x8_t beta_vec = vdupq_n_qs16_f32(_norm_info.beta(), fixed_point_position);
+ const qint16x8_t kappa_vec = vdupq_n_qs16_f32(_norm_info.kappa(), fixed_point_position);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ qint16x8_t accu = vdupq_n_qs16(0);
+ for(int j = first_row; j <= last_row; ++j)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vqaddq_qs16(accu, vld1q_qs16(reinterpret_cast<const qint16_t *>(input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ // Normalize
+ const qint16x8_t accu_scale = vqmlaq_qs16(kappa_vec, coeff_vec, accu, fixed_point_position);
+ const qint16x8_t normalized = vqpowq_qs16(accu_scale, beta_vec, fixed_point_position);
+ const qint16x8_t normalized_pixel = vdivq_qs16(vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr())), normalized, fixed_point_position);
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
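
Both normalize_float and normalize_fixed_point evaluate the same expression, out = in / (kappa + scale_coeff * sum(in^2))^beta, over a neighbourhood of radius norm_size / 2 clamped to the tensor bounds; only the arithmetic (F32, F16 or Q-format) differs. A scalar reference for the 1D in-map case, handy for checking the vectorised paths (standard C++ only, not arm_compute API):

#include <algorithm>
#include <cmath>
#include <vector>

// Illustrative scalar model of IN_MAP_1D normalisation on a single row.
std::vector<float> normalize_in_map_1d(const std::vector<float> &in,
                                       int norm_size, float kappa, float beta, float scale_coeff)
{
    const int radius = norm_size / 2;
    const int width  = static_cast<int>(in.size());
    std::vector<float> out(in.size());
    for(int x = 0; x < width; ++x)
    {
        const int first = std::max(x - radius, 0);
        const int last  = std::min(x + radius, width - 1);
        float accu = 0.f;
        for(int i = first; i <= last; ++i)
        {
            accu += in[i] * in[i]; // the kernel reads this from the pre-squared input tensor
        }
        out[x] = in[x] / std::pow(kappa + scale_coeff * accu, beta);
    }
    return out;
}
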
-void NENormalizationLayerKernel::run(const Window &window)
+void NENormalizationLayerKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index aa8c7a1..19d45e2 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -38,6 +38,10 @@
#include <cstdint>
#include <cstdlib>
+#if ARM_COMPUTE_ENABLE_FP16
+#include <arm_fp16.h> // needed for float16_t
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
using namespace arm_compute;
namespace arm_compute
@@ -127,20 +131,100 @@
template <bool is_scale255, bool is_sat>
void mul_QS8_QS8_QS8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
{
- // n is the exponent of the scaling factor, that is scale = 1/2^n. Currently, we only support scaling factor equal to 1 => n = 0.
- ARM_COMPUTE_ERROR_ON_MSG(n != 0, "Scaling factor different than 1 not supported for 8-bit fixed-point pixel-wise multiplication");
- ARM_COMPUTE_UNUSED(n);
-
- const auto input1 = static_cast<const qint8_t *__restrict>(input1_ptr);
- const auto input2 = static_cast<const qint8_t *__restrict>(input2_ptr);
const auto output = static_cast<qint8_t *__restrict>(output_ptr);
- const qint8x16_t ta1 = vld1q_qs8(input1);
- const qint8x16_t ta2 = vld1q_qs8(input2);
+ const qint8x16_t ta1 = vld1q_qs8(static_cast<const qint8_t *__restrict>(input1_ptr));
+ const qint8x16_t ta2 = vld1q_qs8(static_cast<const qint8_t *__restrict>(input2_ptr));
- qint8x16_t res = (is_sat) ? vqmulq_qs8(ta1, ta2, fixed_point_position) : vmulq_qs8(ta1, ta2, fixed_point_position);
+ if(is_scale255)
+ {
+ qint16x8_t tmp1_high = vmovl_s8(vget_high_s8(ta1));
+ qint16x8_t tmp1_low = vmovl_s8(vget_low_s8(ta1));
+ const qint16x8_t tmp2_high = vmovl_s8(vget_high_s8(ta2));
+ const qint16x8_t tmp2_low = vmovl_s8(vget_low_s8(ta2));
- vst1q_s8(output, res);
+ const float32x4x2_t scale255_f32 =
+ {
+ {
+ scale255_constant_f32q,
+ scale255_constant_f32q
+ }
+ };
+ const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position);
+
+ tmp1_high = vmulq_qs16(tmp1_high, tmp2_high, fixed_point_position);
+ tmp1_low = vmulq_qs16(tmp1_low, tmp2_low, fixed_point_position);
+ tmp1_high = vmulq_qs16(tmp1_high, scale255, fixed_point_position);
+ tmp1_low = vmulq_qs16(tmp1_low, scale255, fixed_point_position);
+
+ if(is_sat)
+ {
+ vst1q_qs8(output, vcombine_s8(vqmovn_s16(tmp1_low), vqmovn_s16(tmp1_high)));
+ }
+ else
+ {
+ vst1q_qs8(output, vcombine_s8(vmovn_s16(tmp1_low), vmovn_s16(tmp1_high)));
+ }
+ }
+ else
+ {
+ const qint8x16_t vn = vdupq_n_s8(-n);
+ qint8x16_t res = ta2;
+
+ if(is_sat)
+ {
+ res = vqshlq_s8(vqmulq_qs8(ta1, res, fixed_point_position), vn);
+ }
+ else
+ {
+ res = vshlq_s8(vmulq_qs8(ta1, res, fixed_point_position), vn);
+ }
+ vst1q_qs8(output, res);
+ }
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_QS16_QS16_QS16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
+{
+ const qint16x8x2_t ta1 = vld2q_qs16(static_cast<const qint16_t *__restrict>(input1_ptr));
+ qint16x8x2_t res = vld2q_qs16(static_cast<const qint16_t *__restrict>(input2_ptr));
+
+ if(is_scale255)
+ {
+ const float32x4x2_t scale255_f32 =
+ {
+ {
+ scale255_constant_f32q,
+ scale255_constant_f32q
+ }
+ };
+ const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position);
+ if(is_sat)
+ {
+ res.val[0] = vqmulq_qs16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position);
+ res.val[1] = vqmulq_qs16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position);
+ }
+ else
+ {
+ res.val[0] = vmulq_qs16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position);
+ res.val[1] = vmulq_qs16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position);
+ }
+ }
+ else
+ {
+ const qint16x8_t vn = vdupq_n_s16(-n);
+ if(is_sat)
+ {
+ res.val[0] = vqshlq_s16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn);
+ res.val[1] = vqshlq_s16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn);
+ }
+ else
+ {
+ res.val[0] = vshlq_s16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn);
+ res.val[1] = vshlq_s16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn);
+ }
+ }
+ vst2q_s16(static_cast<qint16_t *__restrict>(output_ptr), res);
}
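
In the non-scale255 branch the factor scale = 1 / 2^n is folded into a shift by -n (saturating when is_sat) applied after the Q-format multiply. A scalar QS8 model of that path, assuming arithmetic right shifts and ignoring the rounding performed by the intrinsics (illustrative only):

#include <algorithm>
#include <cstdint>

// Illustrative only: scalar QS8 multiply with scale = 1 / 2^n, saturated to [-128, 127].
// Truncating variant; the NEON path additionally rounds inside vqmulq_qs8.
int8_t mul_qs8_scalar(int8_t a, int8_t b, int n, int fixed_point_position)
{
    // Q-format multiply: widen, multiply, shift back by the fractional-bit count.
    int32_t product = (static_cast<int32_t>(a) * static_cast<int32_t>(b)) >> fixed_point_position;
    product >>= n; // scale = 1 / 2^n, mirroring vqshlq_s8(..., vdupq_n_s8(-n))
    return static_cast<int8_t>(std::min<int32_t>(std::max<int32_t>(product, INT8_MIN), INT8_MAX));
}
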
template <bool is_scale255, bool is_sat>
@@ -249,6 +333,33 @@
}
template <bool is_scale255, bool is_sat>
+void mul_F16_F16_F16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ const auto input1 = static_cast<const float16_t *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const float16_t *__restrict>(input2_ptr);
+ const auto output = static_cast<float16_t *__restrict>(output_ptr);
+ const float16x8x2_t ta1 = vld2q_f16(input1);
+ const float16x8x2_t ta2 = vld2q_f16(input2);
+ const float16x8_t scale_vec = vdupq_n_f16(scale);
+ const float16x8x2_t result =
+ {
+ {
+ vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
+ vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
+ }
+ };
+ vst2q_f16(output, result);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(input1_ptr);
+ ARM_COMPUTE_UNUSED(input2_ptr);
+ ARM_COMPUTE_UNUSED(output_ptr);
+ ARM_COMPUTE_UNUSED(scale);
+ ARM_COMPUTE_ERROR("Not supported. Recompile the library with arch=arm64-v8.2-a.");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
+template <bool is_scale255, bool is_sat>
void mul_U8_U8_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
{
const auto input1 = static_cast<const uint8_t *__restrict>(input1_ptr);
@@ -333,16 +444,43 @@
void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+ else if(input1->info()->data_type() == DataType::F16 || input2->info()->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+ else if(input1->info()->data_type() == DataType::QS8 && input2->info()->data_type() == DataType::QS8)
+ {
+ set_data_type_if_unknown(*output->info(), DataType::QS8);
+ set_fixed_point_position_if_zero(*output->info(), input1->info()->fixed_point_position());
+ }
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
- if(output->info()->data_type() == DataType::QS8 || input1->info()->data_type() == DataType::QS8 || output->info()->data_type() == DataType::QS8)
+ if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
{
- // All data types must be QS8
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input1, input2, output);
+ // Check that all data types are the same and all fixed-point positions are the same
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
+ // Check if scale is representable in fixed-point with the provided settings
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(scale, input1);
}
_input1 = input1;
@@ -457,6 +595,22 @@
_func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<false, true> : &mul_QS8_QS8_QS8_n<false, false>;
}
}
+ else if(DataType::QS16 == dt_input1 && DataType::QS16 == dt_input2 && DataType::QS16 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_q_int = is_sat ? &mul_QS16_QS16_QS16_n<true, true> : &mul_QS16_QS16_QS16_n<true, false>;
+ }
+ else
+ {
+ _func_q_int = is_sat ? &mul_QS16_QS16_QS16_n<false, true> : &mul_QS16_QS16_QS16_n<false, false>;
+ }
+ }
+ else if(DataType::F16 == dt_input1 && DataType::F16 == dt_input2 && DataType::F16 == dt_output)
+ {
+ _func_float = &mul_F16_F16_F16_n<false, false>;
+ _func_int = nullptr;
+ }
else if(DataType::F32 == dt_input1 && DataType::F32 == dt_input2 && DataType::F32 == dt_output)
{
_func_float = &mul_F32_F32_F32_n<false, false>;
@@ -486,8 +640,9 @@
INEKernel::configure(win);
}
-void NEPixelWiseMultiplicationKernel::run(const Window &window)
+void NEPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 30b67b6..b97564e 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
@@ -36,7 +37,9 @@
#include <algorithm>
#include <arm_neon.h>
+#include <cmath>
#include <limits>
+#include <set>
#include <string>
#include <tuple>
@@ -47,24 +50,37 @@
inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
- int start_x = id.x() * stride_x - pad_x;
- int start_y = id.y() * stride_y - pad_y;
- int end_x = std::min(start_x + pool_size, upper_bound_w);
- int end_y = std::min(start_y + pool_size, upper_bound_h);
+ const int start_x = id.x() * stride_x - pad_x;
+ const int start_y = id.y() * stride_y - pad_y;
+ const int end_x = std::min(start_x + pool_size, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size, upper_bound_h);
return 1.f / ((end_y - start_y) * (end_x - start_x));
}
inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
{
- static std::array<qint8_t, 10> scale_values_q8 =
+ static const std::array<qint8_t, 10> scale_values_q8 =
{ { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
const int start_x = id.x() * stride_x - pad_x;
const int start_y = id.y() * stride_y - pad_y;
const int end_x = std::min(start_x + pool_size, upper_bound_w);
const int end_y = std::min(start_y + pool_size, upper_bound_h);
const int val = ((end_y - start_y) * (end_x - start_x));
- return scale_values_q8[val] >> (7 - fixed_point_position);
+ return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position));
+}
+
+inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
+ int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
+{
+    static const std::array<qint16_t, 10> scale_values_q16 =
+ { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } };
+ const int start_x = id.x() * stride_x - pad_x;
+ const int start_y = id.y() * stride_y - pad_y;
+ const int end_x = std::min(start_x + pool_size, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size, upper_bound_h);
+ const int val = ((end_y - start_y) * (end_x - start_x));
+ return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
}
} // namespace
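
The scale_values_q8 / scale_values_q16 tables store 1 / area with 7 and 15 fractional bits respectively (for example 0x2AAB ≈ 1/3 and 0x199A ≈ 1/5 in Q1.15), and sshr_qs8 / sshr_qs16 then rescale the constant to the tensor's actual fixed_point_position. A small standalone check of how the Q1.15 constants are derived (standard C++ only):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative only: derive the 1/area constants used by the average-pooling scale tables.
int16_t q15_reciprocal(int area)
{
    return static_cast<int16_t>(std::lround((1 << 15) / static_cast<double>(area)));
}

int main()
{
    for(int area = 2; area <= 9; ++area)
    {
        // e.g. area 3 -> 0x2AAB, area 5 -> 0x199A, as in scale_values_q16.
        std::printf("1/%d in Q1.15 = 0x%04X\n", area, static_cast<unsigned>(q15_reciprocal(area)));
    }
    // Converting to a tensor with fixed_point_position fp is then a right shift by (15 - fp).
}
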
@@ -80,34 +96,44 @@
void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
{
- int pool_pad_x = 0;
- int pool_pad_y = 0;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- unsigned int pooled_w = 0;
- unsigned int pooled_h = 0;
- PoolingType pool_type = pool_info.pool_type();
- int pool_size = pool_info.pool_size();
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
- DimensionRoundingType pool_round = pad_stride_info.round();
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ unsigned int pooled_w = 0;
+ unsigned int pooled_h = 0;
+ PoolingType pool_type = pool_info.pool_type();
+ int pool_size = pool_info.pool_size();
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
- ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size);
+ static const std::set<int> supported_pool_sizes = { 2, 3, 7 };
+ ARM_COMPUTE_UNUSED(supported_pool_sizes);
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->info()->data_type()));
+ ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end());
+ ARM_COMPUTE_ERROR_ON(7 == pool_size && input->info()->data_type() != DataType::F32);
ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
- ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_type == PoolingType::AVG && input->info()->fixed_point_position() > 6);
- ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_stride_x > 2);
+ ARM_COMPUTE_ERROR_ON(is_data_type_fixed_point(input->info()->data_type()) && pool_stride_x > 2);
// Check output dimensions
std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
- pool_size, pool_stride_x, pool_stride_y,
- pool_pad_x, pool_pad_y, pool_round);
- ARM_COMPUTE_UNUSED(pooled_w);
- ARM_COMPUTE_UNUSED(pooled_h);
+ pool_size, pool_size, pool_info.pad_stride_info());
+
+ // Output auto initialization if not yet initialized
+ {
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(0, pooled_w);
+ output_shape.set(1, pooled_h);
+
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
unsigned int num_elems_read_per_iteration = 0;
@@ -118,12 +144,72 @@
switch(input->info()->data_type())
{
case DataType::QS8:
- num_elems_read_per_iteration = 16;
- num_elems_processed_per_iteration = (pool_size == 2) ? 8 : 7;
- num_elems_horizontal_window = 8;
+ num_elems_read_per_iteration = 16;
+ switch(pool_size)
+ {
+ case 2:
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
+ break;
+ case 3:
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Pooling size not supported");
+ break;
+ }
+ num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
break;
+ case DataType::QS16:
+ num_elems_read_per_iteration = 8;
+ switch(pool_size)
+ {
+ case 2:
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
+ break;
+ case 3:
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Pooling size not supported");
+ }
+ num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
+ break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ switch(pool_size)
+ {
+ case 2:
+ num_elems_read_per_iteration = 16;
+ num_elems_processed_per_iteration = 8;
+ num_elems_horizontal_window = 8;
+ break;
+ case 3:
+ num_elems_read_per_iteration = 4;
+ num_elems_processed_per_iteration = 1;
+ num_elems_horizontal_window = 1;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Pooling size not supported");
+ break;
+ }
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::F32:
- num_elems_read_per_iteration = (pool_size == 2) ? 2 : 4; // We use vload4 for pooling3
+ switch(pool_size)
+ {
+ case 2:
+ num_elems_read_per_iteration = 2;
+ break;
+ case 3:
+ num_elems_read_per_iteration = 4; // We use vload4 for pooling3
+ break;
+ case 7:
+ num_elems_read_per_iteration = 8; // We use vload8 for pooling7
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Pooling size not supported");
+ break;
+ }
num_elems_processed_per_iteration = 1;
num_elems_horizontal_window = 1;
break;
@@ -152,21 +238,145 @@
case 2:
if(input->info()->data_type() == DataType::QS8)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ }
+ else if(input->info()->data_type() == DataType::QS16)
+ {
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ }
+ else if(input->info()->data_type() == DataType::F16)
+ {
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
else if(input->info()->data_type() == DataType::F32)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
break;
case 3:
if(input->info()->data_type() == DataType::QS8)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ }
+ else if(input->info()->data_type() == DataType::QS16)
+ {
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ }
+ else if(input->info()->data_type() == DataType::F16)
+ {
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
else if(input->info()->data_type() == DataType::F32)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ }
+ break;
+ case 7:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
}
break;
default:
@@ -207,7 +417,8 @@
{
const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
- qint8x8_t res = {};
+ qint8x8_t lower_res = {};
+ qint8x8_t upper_res = {};
if(pooling_type == PoolingType::AVG)
{
// Calculate scale
@@ -216,24 +427,175 @@
// Perform pooling
const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
- res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
+ lower_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
+ if(pool_stride_x == 1)
+ {
+ const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1);
+ upper_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position);
+ }
}
else
{
const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
- res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
+ lower_res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
+ if(pool_stride_x == 1)
+ {
+ const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1);
+ upper_res = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted));
+ }
}
- vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
+ if(pool_stride_x == 1)
+ {
+ const qint8x8x2_t res = { { lower_res, upper_res } };
+ vst2_s8(reinterpret_cast<qint8_t *>(output.ptr()), res);
+ }
+ else
+ {
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), lower_res);
+ }
},
input, output);
}
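
When pool_stride_x == 1 the pooling2_q8 path above produces two results per loaded vector: lower_res from the pairwise reduction of the data and upper_res from the same data shifted by one element (vextq), interleaved back with vst2 so outputs land in order. A scalar model of that even/odd split for 2-wide horizontal averaging along one row (illustrative only, vertical reduction omitted):

#include <cstddef>
#include <vector>

// Illustrative only: stride-1, 2-wide horizontal averaging computed as two
// interleaved streams, mirroring the vext + vst2 pattern of pooling2_q8/q16.
// Expects at least two elements in the row.
std::vector<float> horizontal_avg2_stride1(const std::vector<float> &row)
{
    std::vector<float> out(row.size() - 1);
    for(std::size_t x = 0; x + 1 < row.size(); x += 2)
    {
        out[x] = 0.5f * (row[x] + row[x + 1]);             // "lower_res" lane
        if(x + 2 < row.size())
        {
            out[x + 1] = 0.5f * (row[x + 1] + row[x + 2]); // "upper_res" lane (shifted input)
        }
    }
    return out;
}
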
template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
+ const int fixed_point_position = _input->info()->fixed_point_position();
+ constexpr int pool_size = 2;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
+ const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
+ qint16x4_t lower_res = {};
+ qint16x4_t upper_res = {};
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+ const qint16x4_t scale_vec = vdup_n_qs16(scale);
+
+ // Perform pooling
+ const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data);
+ lower_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position);
+ if(pool_stride_x == 1)
+ {
+ const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1);
+ upper_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position);
+ }
+ }
+ else
+ {
+ const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data);
+ lower_res = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data));
+ if(pool_stride_x == 1)
+ {
+ const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1);
+ upper_res = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted));
+ }
+ }
+ if(pool_stride_x == 1)
+ {
+ const qint16x4x2_t res = { { lower_res, upper_res } };
+ vst2_s16(reinterpret_cast<qint16_t *>(output.ptr()), res);
+ }
+ else
+ {
+ vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), lower_res);
+ }
+ },
+ input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr const int pool_size = 3;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+ const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
+ float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
+ float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+ float16x4_t res = {};
+
+        // Square the input in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ top_data = vmul_f16(top_data, top_data);
+ middle_data = vmul_f16(middle_data, middle_data);
+ bottom_data = vmul_f16(bottom_data, bottom_data);
+ }
+
+ if(pooling_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ const float16x4_t scale_v = vdup_n_f16(scale);
+ // Perform pooling
+ const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
+ res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
+ res = vmul_f16(vpadd_f16(res, res), scale_v);
+ }
+ else
+ {
+ const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
+ res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
+ res = vpmax_f16(res, res);
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ res = vinv_f16(vinvsqrt_f16(res));
+ }
+
+ *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
+ },
+ input, output);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(window_input);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
constexpr int pool_size = 2;
int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
@@ -246,10 +608,81 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- const float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
- const float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
- float32x2_t res = {};
- if(pooling_type == PoolingType::AVG)
+ auto top_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
+ auto bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+ float16x8_t res = {};
+
+ // Square the data in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ top_data.val[0] = vmulq_f16(top_data.val[0], top_data.val[0]);
+ top_data.val[1] = vmulq_f16(top_data.val[1], top_data.val[1]);
+ bottom_data.val[0] = vmulq_f16(bottom_data.val[0], bottom_data.val[0]);
+ bottom_data.val[1] = vmulq_f16(bottom_data.val[1], bottom_data.val[1]);
+ }
+
+ if(pooling_type != PoolingType::MAX)
+ {
+ const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ const float16x8_t scale_v = vdupq_n_f16(scale);
+ res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
+ }
+ else
+ {
+ res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ res = vinvq_f16(vinvsqrtq_f16(res));
+ }
+
+ // Store result
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
+ },
+ input, output);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(window_input);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr int pool_size = 2;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+ float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+ float32x2_t res = {};
+ float final_res = 0;
+
+ // Square the data in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ top_data = vmul_f32(top_data, top_data);
+ bottom_data = vmul_f32(bottom_data, bottom_data);
+ }
+
+ if(pooling_type != PoolingType::MAX)
{
// Calculate scale
float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
@@ -264,7 +697,16 @@
const float32x2_t max_data = vmax_f32(top_data, bottom_data);
res = vpmax_f32(max_data, max_data);
}
- *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(output.ptr())) = final_res;
},
input, output);
}
@@ -299,8 +741,7 @@
if(pooling_type == PoolingType::AVG)
{
// Calculate scale
- const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
- const qint8x8_t scale_vec = vdup_n_qs8(scale);
+ const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
// Perform pooling for stride 2
const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
@@ -311,13 +752,16 @@
{
const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ const qint8x8_t scale_vec = vdup_n_qs8(scale);
res = vtbl2_s8(table, lookup_val);
+ res = vqmul_qs8(res, scale_vec, fixed_point_position);
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
}
else
{
- res = vget_low_s8(final_sum);
+ const qint8x16_t scale_vec = vdupq_n_qs8(scale);
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position));
}
- res = vqmul_qs8(res, scale_vec, fixed_point_position);
}
else
{
@@ -331,25 +775,29 @@
const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
res = vtbl2_s8(table, lookup_val);
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
}
else
{
- res = vget_low_s8(final_max);
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), final_max);
}
}
- vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
},
input, output);
}
template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
- constexpr const int pool_size = 3;
- int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
+ const int fixed_point_position = _input->info()->fixed_point_position();
+ constexpr int pool_size = 3;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
@@ -361,13 +809,92 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- const float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
- const float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
- const float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
- float32x2_t res = {};
+ const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
+ const auto middle_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_middle_ptr + input.offset()));
+ const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
+
if(pooling_type == PoolingType::AVG)
{
// Calculate scale
+ const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+
+ // Perform pooling for stride 2
+ const qint16x8_t sum_data = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
+ const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1);
+ const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2);
+ const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3);
+ if(pool_stride_x == 2)
+ {
+ const qint16x4_t tmp = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) };
+ const qint16x4_t scale_vec = vdup_n_qs16(scale);
+ vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position));
+ }
+ else
+ {
+ const qint16x8_t scale_vec = vdupq_n_qs16(scale);
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position));
+ }
+ }
+ else
+ {
+ const qint16x8_t max_data = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data);
+ const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1);
+ const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2);
+ const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3);
+
+ if(pool_stride_x == 2)
+ {
+ const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) };
+ vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), tmp);
+ }
+ else
+ {
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), final_max);
+ }
+ }
+ },
+ input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr const int pool_size = 3;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+ const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+ float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
+ float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+ float32x2_t res = {};
+ float final_res = 0;
+
+ // Square the data in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ top_data = vmulq_f32(top_data, top_data);
+ middle_data = vmulq_f32(middle_data, middle_data);
+ bottom_data = vmulq_f32(bottom_data, bottom_data);
+ }
+
+ if(pooling_type != PoolingType::MAX)
+ {
+ // Calculate scale
float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
const float32x2_t scale_v = vdup_n_f32(scale);
@@ -382,30 +909,133 @@
res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
res = vpmax_f32(res, res);
}
- *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(output.ptr())) = final_res;
},
input, output);
}
-void NEPoolingLayerKernel::run(const Window &window)
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window)
{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr const int pool_size = 7;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ std::array<const uint8_t *, pool_size> input_ptrs{ {} };
+ for(int i = 0; i < pool_size; ++i)
+ {
+ input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + i));
+ }
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float32x2_t res = {};
+ float final_res = 0.f;
+ if(pooling_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
+
+ // Perform pooling
+ float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
+ // Square the data in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+ data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+ }
+ float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
+ for(int i = 1; i < pool_size; ++i)
+ {
+ data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+ // Square the data in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+ data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+ }
+ sum_data = vaddq_f32(sum_data, data.val[0]);
+ sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
+ }
+ res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
+ res = vmul_f32(vpadd_f32(res, res), scale_v);
+ }
+ else
+ {
+ float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
+ for(int i = 1; i < pool_size; ++i)
+ {
+ const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+ max_data = vmax2q_f32(max_data, data);
+ }
+ res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
+ res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
+ res = vpmax_f32(res, res);
+ }
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(output.ptr())) = final_res;
+ },
+ input, output);
+}
+
+void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- unsigned int pool_stride_x, pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
+ const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
// Set step for the input in x and y direction
Window window_input(window);
unsigned int window_x_inc = 0;
- if(_input->info()->data_type() == DataType::QS8)
+ switch(_input->info()->data_type())
{
- window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
- }
- else
- {
- window_x_inc = pool_stride_x;
+ case DataType::QS8:
+ case DataType::QS16:
+ case DataType::F16:
+ {
+ window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+ break;
+ }
+ case DataType::F32:
+ {
+ window_x_inc = pool_stride_x;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
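
The pooling paths added above share one L2 flow: square the loaded elements, accumulate them, apply the average scale, and take the square root of the value that is written out. As a rough scalar reference of that flow (the exact window clamping performed by calculate_avg_scale is only approximated here, and the 3x3 window size is an assumption of the sketch):

#include <algorithm>
#include <cmath>

// Scalar sketch of one 3x3 L2-pooling output element. `in` is a row-major
// width x height plane; padding handling and the scale computed by
// calculate_avg_scale in the kernel are simplified assumptions here.
float l2_pool3x3(const float *in, int width, int height, int out_x, int out_y, int stride_x, int stride_y)
{
    float sum_sq = 0.f;
    int   count  = 0;
    for(int dy = 0; dy < 3; ++dy)
    {
        for(int dx = 0; dx < 3; ++dx)
        {
            const int x = out_x * stride_x + dx;
            const int y = out_y * stride_y + dy;
            if(x < width && y < height) // stay inside the valid input region
            {
                const float v = in[y * width + x];
                sum_sq += v * v; // squaring step done with vmulq_f32 above
                ++count;
            }
        }
    }
    // Average of the squares, then square root, mirroring the scale
    // multiplication followed by sqrt in the NEON path.
    return std::sqrt(sum_sq / static_cast<float>(std::max(count, 1)));
}
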
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
new file mode 100644
index 0000000..bff79f0
--- /dev/null
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+NEQuantizationLayerKernel::NEQuantizationLayerKernel()
+ : _input(nullptr), _output(nullptr), _min_max(nullptr)
+{
+}
+
+void NEQuantizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::U8, 0);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _output = output;
+ _min_max = min_max;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+
+ // Update window and padding
+ update_window_and_padding(win, input_access, output_access, min_max_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEQuantizationLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window window_input_output(window);
+ window_input_output.collapse_if_possible(INEKernel::window(), 3);
+ window_input_output.set(3, Window::Dimension(0, 1, 1));
+
+ Window window_min_max;
+ window_min_max.use_tensor_dimensions(_min_max->info()->tensor_shape());
+ window_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_min_max.collapse_if_possible(INEKernel::window(), 1);
+
+ Iterator input(_input, window_input_output);
+ Iterator output(_output, window_input_output);
+ Iterator min_max(_min_max, window_min_max);
+
+ execute_window_loop(window_min_max, [&](const Coordinates & id_batch)
+ {
+ // Get the min and max
+ float min = *(reinterpret_cast<const float *>(min_max.ptr()) + 0);
+ float max = *(reinterpret_cast<const float *>(min_max.ptr()) + 1);
+
+ // Saturate the result if min = max
+ if(min == max)
+ {
+ min = 0.0f;
+ max = 1.0f;
+ }
+
+ const float32x4_t vmin = vdupq_n_f32(min);
+ const float32x4_t inv_range = vdupq_n_f32(1.0f / (max - min));
+ const float32x4_t quantization_max = vdupq_n_f32(255.0f);
+ const float32x4_t quantization_mul = vdupq_n_f32(256.0f);
+
+ // Uniformly map values onto the 8-bit integer range, i.e. [min, max] -> [0, 255]
+ execute_window_loop(window_input_output, [&](const Coordinates & id)
+ {
+ // Get the input values
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);
+ float32x4x2_t val = vld2q_f32(input_ptr);
+
+ // Map float values to range [0.0, 1.0]
+ val.val[0] = vsubq_f32(val.val[0], vmin);
+ val.val[1] = vsubq_f32(val.val[1], vmin);
+ val.val[0] = vmulq_f32(val.val[0], inv_range);
+ val.val[1] = vmulq_f32(val.val[1], inv_range);
+
+ // Quantize
+ val.val[0] = vmulq_f32(val.val[0], quantization_mul);
+ val.val[1] = vmulq_f32(val.val[1], quantization_mul);
+ val.val[0] = vminq_f32(val.val[0], quantization_max);
+ val.val[1] = vminq_f32(val.val[1], quantization_max);
+
+ const uint32x4_t val_u32_low = vcvtq_u32_f32(val.val[0]);
+ const uint32x4_t val_u32_high = vcvtq_u32_f32(val.val[1]);
+ const uint16x4x2_t val_u16 = vzip_u16(vmovn_u32(val_u32_low), vmovn_u32(val_u32_high));
+
+ const uint8x8_t quantized = vmovn_u16(vcombine_u16(val_u16.val[0], val_u16.val[1]));
+
+ // Store the quantized values
+ auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr() + id_batch[1] * _output->info()->strides_in_bytes()[3]);
+ vst1_u8(output_ptr, quantized);
+ },
+ input, output);
+ },
+ min_max);
+}
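
For reference, the per-element mapping that the vectorized loop above applies is small enough to state in scalar form. This sketch mirrors the visible arithmetic (subtract min, scale by 1/(max - min), multiply by 256, clamp to 255, truncate); the lower clamp is added only to keep the standalone function well defined:

#include <algorithm>
#include <cstdint>

// Scalar sketch of the quantization applied per element above:
// [min, max] is mapped uniformly onto [0, 255].
uint8_t quantize(float x, float min, float max)
{
    if(min == max) // degenerate range, same fallback as the kernel
    {
        min = 0.0f;
        max = 1.0f;
    }
    float q = (x - min) * (1.0f / (max - min)) * 256.0f;
    q = std::min(q, 255.0f); // clamp to the 8-bit maximum, as vminq_f32 does
    q = std::max(q, 0.0f);   // lower clamp added for this sketch only
    return static_cast<uint8_t>(q); // truncating conversion, like vcvtq_u32_f32
}
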
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
new file mode 100644
index 0000000..a209a52
--- /dev/null
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <cfloat>
+#include <cmath>
+
+using namespace arm_compute;
+
+NEROIPoolingLayerKernel::NEROIPoolingLayerKernel()
+ : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
+{
+}
+
+void NEROIPoolingLayerKernel::configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, rois, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
+ ARM_COMPUTE_ERROR_ON(rois->num_values() == 0);
+
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values());
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+
+ // Set instance variables
+ _input = input;
+ _rois = rois;
+ _output = output;
+ _pool_info = pool_info;
+
+ // Configure kernel window
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, rois->num_values()));
+ window.set(Window::DimY, Window::Dimension(0, 1));
+
+ AccessWindowStatic input_access(input->info(),
+ input->info()->valid_region().start(0),
+ input->info()->valid_region().start(1),
+ input->info()->valid_region().end(0),
+ input->info()->valid_region().end(1));
+ AccessWindowStatic output_access(output->info(), 0, 0, pool_info.pooled_width(), pool_info.pooled_height());
+
+ update_window_and_padding(window, input_access, output_access);
+ output_access.set_valid_region(window, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ INEKernel::configure(window);
+}
+
+void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const int roi_list_start = window.x().start();
+ const int roi_list_end = window.x().end();
+ const int width = _input->info()->dimension(Window::DimX);
+ const int height = _input->info()->dimension(Window::DimY);
+ const int fms = _input->info()->dimension(Window::DimZ);
+ const int pooled_w = _pool_info.pooled_width();
+ const int pooled_h = _pool_info.pooled_height();
+ const float spatial_scale = _pool_info.spatial_scale();
+
+ for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
+ {
+ const ROI &curr_roi = _rois->at(roi_indx);
+
+ // Scale ROI
+ const int roi_batch = curr_roi.batch_idx;
+ const int roi_anchor_x = support::cpp11::round(curr_roi.rect.x * spatial_scale);
+ const int roi_anchor_y = support::cpp11::round(curr_roi.rect.y * spatial_scale);
+ const int roi_width = std::max(support::cpp11::round(curr_roi.rect.width * spatial_scale), 1.f);
+ const int roi_height = std::max(support::cpp11::round(curr_roi.rect.height * spatial_scale), 1.f);
+
+ // Iterate through all feature maps
+ for(int fm = 0; fm < fms; ++fm)
+ {
+ // Iterate through all output pixels
+ for(int py = 0; py < pooled_h; ++py)
+ {
+ for(int px = 0; px < pooled_w; ++px)
+ {
+ auto region_start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
+ auto region_end_x = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
+ auto region_start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
+ auto region_end_y = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
+
+ region_start_x = std::min(std::max(region_start_x + roi_anchor_x, 0), width);
+ region_end_x = std::min(std::max(region_end_x + roi_anchor_x, 0), width);
+ region_start_y = std::min(std::max(region_start_y + roi_anchor_y, 0), height);
+ region_end_y = std::min(std::max(region_end_y + roi_anchor_y, 0), height);
+
+ // Iterate through the pooling region
+ if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ {
+ *reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = 0;
+ }
+ else
+ {
+ float curr_max = -FLT_MAX;
+ for(int j = region_start_y; j < region_end_y; ++j)
+ {
+ for(int i = region_start_x; i < region_end_x; ++i)
+ {
+ const auto val = *reinterpret_cast<const float *>(_input->ptr_to_element(Coordinates(i, j, fm, roi_batch)));
+ curr_max = std::max(val, curr_max);
+ }
+ }
+ *reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = curr_max;
+ }
+ }
+ }
+ }
+ }
+}
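
The inner loops above reduce to a small piece of index arithmetic: each pooled cell (px, py) selects a sub-rectangle of the scaled ROI, shifted by the ROI anchor and clamped to the input bounds. A condensed restatement of just that computation (the Region struct is only for illustration):

#include <algorithm>
#include <cmath>

// Region of the input pooled into output cell (px, py); mirrors the
// boundary computation in the loop above.
struct Region
{
    int start_x, end_x, start_y, end_y;
};

Region roi_region(int px, int py, int pooled_w, int pooled_h,
                  int roi_anchor_x, int roi_anchor_y, int roi_width, int roi_height,
                  int in_width, int in_height)
{
    Region r{};
    r.start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
    r.end_x   = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
    r.start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
    r.end_y   = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));

    // Shift by the ROI anchor and clamp to the input plane.
    r.start_x = std::min(std::max(r.start_x + roi_anchor_x, 0), in_width);
    r.end_x   = std::min(std::max(r.end_x + roi_anchor_x, 0), in_width);
    r.start_y = std::min(std::max(r.start_y + roi_anchor_y, 0), in_height);
    r.end_y   = std::min(std::max(r.end_y + roi_anchor_y, 0), in_height);
    return r; // an empty region (end <= start) is written out as 0 by the kernel
}
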
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
new file mode 100644
index 0000000..1a50ed8
--- /dev/null
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace
+{
+template <class F>
+class Reducer
+{
+public:
+ static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f)
+ {
+ // Set out window
+ Window out_window(window);
+ out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ // Get first input and output slices
+ Window in_slice = window.first_slice_window_1D();
+ Window out_slice = out_window.first_slice_window_1D();
+
+ do
+ {
+ Iterator in(input, in_slice);
+ Iterator out(output, out_slice);
+
+ f(in, out, in_slice, out_slice);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
+ }
+};
+
+struct SumsqOpX
+{
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice)
+ {
+ ARM_COMPUTE_UNUSED(out_slice);
+ float32x4_t vec_sum_value = vdupq_n_f32(0.f);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ const float32x4_t vec_elements = vld1q_f32(in_ptr);
+ vec_sum_value = vaddq_f32(vmulq_f32(vec_elements, vec_elements), vec_sum_value);
+ },
+ input);
+
+ float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
+ carry_addition = vpadd_f32(carry_addition, carry_addition);
+
+ *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(carry_addition, 0);
+ }
+};
+
+void reduce_sumsq(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
+{
+ switch(axis)
+ {
+ case 0:
+ return Reducer<SumsqOpX>::reduceX(window, input, output, SumsqOpX());
+ default:
+ ARM_COMPUTE_ERROR("Unsupported reduction axis");
+ }
+}
+} // namespace
+
+NEReductionOperationKernel::NEReductionOperationKernel()
+ : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE), _border_size()
+{
+}
+
+BorderSize NEReductionOperationKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis; only axis 0 is supported");
+
+ // Calculate output shape and set if empty
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(axis, 1);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
+
+ _input = input;
+ _output = output;
+ _border_size = (axis == 0) ? BorderSize(0, num_elems_processed_per_iteration - (input->info()->dimension(0) % num_elems_processed_per_iteration), 0, 0) : BorderSize();
+ _op = op;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEReductionOperationKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_op)
+ {
+ case ReductionOperation::SUM_SQUARE:
+ reduce_sumsq(window, _input, _output, _reduction_axis);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported reduction operation.");
+ }
+}
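
SumsqOpX above accumulates four squared lanes per iteration and folds them with vpadd_f32; the scalar effect is simply a sum of squares along the X axis. A plain reference of that reduction, with the row-major layout an assumption of the sketch:

#include <cstddef>
#include <vector>

// Scalar equivalent of the SUM_SQUARE reduction along X: each row of
// length `width` collapses to a single sum of squared elements.
std::vector<float> reduce_sumsq_x(const std::vector<float> &in, std::size_t width, std::size_t rows)
{
    std::vector<float> out(rows, 0.f);
    for(std::size_t r = 0; r < rows; ++r)
    {
        float sum = 0.f;
        for(std::size_t x = 0; x < width; ++x)
        {
            const float v = in[r * width + x];
            sum += v * v; // per-lane vmulq_f32 + vaddq_f32 in the kernel
        }
        out[r] = sum; // the kernel folds the four lanes with vpadd_f32
    }
    return out;
}
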
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
index c3c44a5..83004ae 100644
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -192,32 +192,33 @@
const uint8_t *in_ptr = in.ptr();
uint8x8_t tmp0 = vdup_n_u8(0);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
uint8x8_t tmp1 = vdup_n_u8(0);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
},
in, out, mapx, mapy);
}
-void NERemapKernel::run(const Window &window)
+void NERemapKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
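
The rename above switches the remap kernel to pixel_bilinear_c1_clamp, which is defined in the library's helper headers and not shown in this patch. As a rough illustration only of what a clamped single-channel bilinear fetch computes (rounding and edge handling in the real helper may differ):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative clamped bilinear sample of a single-channel 8-bit image.
uint8_t bilinear_clamp(const uint8_t *img, int stride, int width, int height, float x, float y)
{
    // Clamp the sample position to the image.
    x = std::min(std::max(x, 0.f), static_cast<float>(width - 1));
    y = std::min(std::max(y, 0.f), static_cast<float>(height - 1));

    const int   x0 = static_cast<int>(std::floor(x));
    const int   y0 = static_cast<int>(std::floor(y));
    const int   x1 = std::min(x0 + 1, width - 1);
    const int   y1 = std::min(y0 + 1, height - 1);
    const float dx = x - x0;
    const float dy = y - y0;

    // Blend the four neighbours.
    const float top    = img[y0 * stride + x0] * (1.f - dx) + img[y0 * stride + x1] * dx;
    const float bottom = img[y1 * stride + x0] * (1.f - dx) + img[y1 * stride + x1] * dx;
    return static_cast<uint8_t>(top * (1.f - dy) + bottom * dy + 0.5f);
}
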
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
new file mode 100644
index 0000000..8e69252
--- /dev/null
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+template <typename T>
+inline void reshape_tensor(const Window &window, const ITensor *input, ITensor *output)
+{
+ const TensorShape &input_shape = input->info()->tensor_shape();
+ const TensorShape &output_shape = output->info()->tensor_shape();
+ Coordinates output_coord{};
+
+ window.collapse_if_possible(window, 3);
+ Iterator in(input, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ output_coord = index2coords(output_shape, coords2index(input_shape, id));
+ *reinterpret_cast<T *>(output->ptr_to_element(output_coord)) = *reinterpret_cast<T *>(in.ptr());
+ },
+ in);
+}
+} // namespace
+
+void NEReshapeLayerKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size());
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output->info(), 0, 0, output->info()->tensor_shape().x(), output->info()->tensor_shape().y());
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEReshapeLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::U8:
+ case DataType::S8:
+ case DataType::QS8:
+ reshape_tensor<uint8_t>(window, _input, _output);
+ break;
+ case DataType::U16:
+ case DataType::S16:
+ case DataType::QS16:
+ case DataType::F16:
+ reshape_tensor<uint16_t>(window, _input, _output);
+ break;
+ case DataType::U32:
+ case DataType::S32:
+ case DataType::F32:
+ reshape_tensor<uint32_t>(window, _input, _output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type!");
+ }
+}
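
reshape_tensor above relies on the coords2index / index2coords helpers: an element's linear offset is computed in the input shape and re-expanded in the output shape, with dimension 0 (x) the fastest-moving one. The library helpers live elsewhere; this sketch shows only the arithmetic assumed here:

#include <cstddef>
#include <vector>

// Linearize coordinates with dimension 0 contiguous...
std::size_t coords_to_index(const std::vector<std::size_t> &shape, const std::vector<std::size_t> &coords)
{
    std::size_t index  = 0;
    std::size_t stride = 1;
    for(std::size_t d = 0; d < shape.size(); ++d)
    {
        index += coords[d] * stride;
        stride *= shape[d];
    }
    return index;
}

// ...and expand a linear offset back into coordinates of another shape.
std::vector<std::size_t> index_to_coords(const std::vector<std::size_t> &shape, std::size_t index)
{
    std::vector<std::size_t> coords(shape.size(), 0);
    for(std::size_t d = 0; d < shape.size(); ++d)
    {
        coords[d] = index % shape[d];
        index /= shape[d];
    }
    return coords;
}
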
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index fd2978d..6634d4b 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -50,8 +50,10 @@
void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(output == input);
if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
@@ -79,6 +81,16 @@
_dx = dx;
_dy = dy;
+ /* Compute the ratio between source width/height and destination width/height */
+ const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
+ const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+
+ /* Area interpolation behaves as Nearest Neighbour in case of up-sampling */
+ if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ {
+ policy = InterpolationPolicy::NEAREST_NEIGHBOR;
+ }
+
switch(policy)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
@@ -104,13 +116,18 @@
}
constexpr unsigned int num_elems_processed_per_iteration = 16;
- const int border_offset = (border_undefined) ? 0 : border_size().left;
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input->info(), -border_offset, -border_offset, input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
- AccessWindowHorizontal offsets_access(offsets->info(), 0, num_elems_processed_per_iteration);
+ const ValidRegion &input_valid_region = input->info()->valid_region();
+
+ // Reads can occur within the valid region of the input
+ AccessWindowStatic input_access(input->info(),
+ input_valid_region.anchor[0] - border_size().left, input_valid_region.anchor[1] - border_size().top,
+ input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size().right,
+ input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom);
+ AccessWindowHorizontal offsets_access(offsets == nullptr ? nullptr : offsets->info(), 0, num_elems_processed_per_iteration);
AccessWindowHorizontal dx_access(dx == nullptr ? nullptr : dx->info(), 0, num_elems_processed_per_iteration);
AccessWindowHorizontal dy_access(dy == nullptr ? nullptr : dy->info(), 0, num_elems_processed_per_iteration);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
@@ -122,8 +139,7 @@
dy_access,
output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
+ output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()), output->info()->tensor_shape(), policy, border_size(), border_undefined));
INEKernel::configure(win);
}
@@ -164,8 +180,8 @@
const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
const uint8_t *const in_ptr = in.ptr();
- const size_t in_yi = (id.y() + 0.5f) * hr;
- const size_t offset_row = in_yi * input_stride;
+ const int in_yi = std::floor((id.y() + 0.5f) * hr);
+ const int offset_row = in_yi * input_stride;
tmp = vsetq_lane_u8(in_ptr[offsets_ptr[0] + offset_row], tmp, 0);
tmp = vsetq_lane_u8(in_ptr[offsets_ptr[1] + offset_row], tmp, 1);
@@ -203,8 +219,8 @@
{
const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
- const size_t in_yi = (id.y() + 0.5f) * hr;
- const size_t offset_row = in_yi * input_stride;
+ const int in_yi = (id.y() + 0.5f) * hr;
+ const int offset_row = in_yi * input_stride;
tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1);
@@ -229,6 +245,50 @@
in, offsets, out);
break;
}
+ case DataType::F32:
+ {
+ float32x4x4_t tmp =
+ {
+ {
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0)
+ }
+ };
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+
+ const int in_yi = (id.y() + 0.5f) * hr;
+ const int offset_row = in_yi * input_stride;
+
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 3);
+
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 3);
+
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[2], 0);
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[2], 1);
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[2], 2);
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[2], 3);
+
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[3], 0);
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[3], 1);
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[3], 2);
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[3], 3);
+
+ vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
+ },
+ in, offsets, out);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Not supported");
break;
@@ -237,7 +297,7 @@
void NEScaleKernel::scale_bilinear(const Window &window)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::S16, DataType::F32);
// Compute the ratio between source height and destination height
const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
@@ -264,41 +324,140 @@
Iterator dy(_dy, win_off);
/* Input image stride */
- const size_t in_stride = _input->info()->strides_in_bytes()[1];
+ const size_t in_stide_in_bytes = _input->info()->strides_in_bytes()[1];
+ const size_t in_stride = in_stide_in_bytes / _input->info()->element_size();
- execute_window_loop(window, [&](const Coordinates & id)
+ switch(_input->info()->data_type())
{
- const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
- const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
- const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
- const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+ case DataType::U8:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
- const size_t in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
- const size_t offset_row = in_yi * in_stride;
+ const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int offset_row = in_yi * in_stide_in_bytes;
- uint8x8_t tmp0 = vdup_n_u8(0);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
- uint8x8_t tmp1 = vdup_n_u8(0);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
- vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
- },
- in, offsets, dx, dy, out);
+ vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ in, offsets, dx, dy, out);
+ break;
+ }
+ case DataType::S16:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
+
+ const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int offset_row = in_yi * in_stide_in_bytes;
+
+ int16x8x2_t tmp =
+ {
+ {
+ vdupq_n_s16(0),
+ vdupq_n_s16(0)
+ }
+ };
+
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7);
+
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7);
+
+ vst2q_s16(reinterpret_cast<int16_t *>(out.ptr()), tmp);
+ },
+ in, offsets, dx, dy, out);
+ break;
+ }
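The even/odd lane split in the S16 case is deliberate: vst2q_s16 interleaves its two registers on store, so filling val[0] with the even offsets and val[1] with the odd ones puts the 16 results back into sequential output order (the F32 case below does the same with a 4-way split and vst4q_f32). A minimal sketch of that store behaviour, assuming a plain int16_t destination:

#include <arm_neon.h>
#include <cstdint>

// Writes even[0], odd[0], even[1], odd[1], ..., even[7], odd[7] to dst.
inline void store_16_interleaved(int16_t *dst, int16x8_t even, int16x8_t odd)
{
    const int16x8x2_t pair = { { even, odd } };
    vst2q_s16(dst, pair);
}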
+ case DataType::F32:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
+
+ const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int offset_row = in_yi * in_stide_in_bytes;
+
+ float32x4x4_t tmp =
+ {
+ {
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0)
+ }
+ };
+
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 3);
+
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 3);
+
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[2], 0);
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[2], 1);
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[2], 2);
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[2], 3);
+
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[3], 0);
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[3], 1);
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[3], 2);
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[3], 3);
+
+ vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
+ },
+ in, offsets, dx, dy, out);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
}
void NEScaleKernel::scale_area(const Window &window)
@@ -349,8 +508,9 @@
in, out);
}
-void NEScaleKernel::run(const Window &window)
+void NEScaleKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
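Every lane set in the bilinear scale cases above comes from delta_bilinear_c1, which blends the four neighbours around a fractional source coordinate. A scalar sketch of that helper (illustrative only; it assumes stride is the row pitch in elements of T):

#include <cstddef>

template <typename T>
float delta_bilinear_sketch(const T *top_left, size_t stride, float dx, float dy)
{
    const float dx1 = 1.0f - dx;
    const float dy1 = 1.0f - dy;

    // Four neighbours of the sub-pixel source position
    const float a00 = top_left[0];
    const float a01 = top_left[1];
    const float a10 = top_left[stride];
    const float a11 = top_left[stride + 1];

    // Bilinear weights sum to 1, so the result stays within the input range
    return a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
}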
diff --git a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
index 183df1e..f23c31b 100644
--- a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
@@ -135,8 +135,9 @@
return BorderSize(1);
}
-void NEScharr3x3Kernel::run(const Window &window)
+void NEScharr3x3Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NESobel3x3Kernel.cpp b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
index ab08a1c..5a80630 100644
--- a/src/core/NEON/kernels/NESobel3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
@@ -88,8 +88,9 @@
INEKernel::configure(win);
}
-void NESobel3x3Kernel::run(const Window &window)
+void NESobel3x3Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NESobel5x5Kernel.cpp b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
index 488eee1..30e7817 100644
--- a/src/core/NEON/kernels/NESobel5x5Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
@@ -90,8 +90,9 @@
INEKernel::configure(win);
}
-void NESobel5x5HorKernel::run(const Window &window)
+void NESobel5x5HorKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
@@ -261,8 +262,9 @@
INEKernel::configure(win);
}
-void NESobel5x5VertKernel::run(const Window &window)
+void NESobel5x5VertKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.cpp b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
index 9761942..4cc80f8 100644
--- a/src/core/NEON/kernels/NESobel7x7Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
@@ -193,8 +193,9 @@
INEKernel::configure(win);
}
-void NESobel7x7HorKernel::run(const Window &window)
+void NESobel7x7HorKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
@@ -351,8 +352,9 @@
INEKernel::configure(win);
}
-void NESobel7x7VertKernel::run(const Window &window)
+void NESobel7x7VertKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index 942662e..648dac4 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -26,7 +26,6 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/NEON/NEMath.h"
@@ -43,6 +42,104 @@
namespace
{
+void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window in_slice = window.first_slice_window_1D();
+
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window max_slice = window_max.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator output(out, max_slice);
+
+ qint8x16_t vec_max = vdupq_n_s8(std::numeric_limits<qint8_t>::lowest());
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
+ const qint8x16_t current_value = vld1q_qs8(in_ptr);
+ vec_max = vmaxq_qs8(vec_max, current_value);
+ },
+ input);
+
+ qint8x8_t carry_max = vpmax_qs8(vget_high_s8(vec_max), vget_low_s8(vec_max));
+ carry_max = vpmax_qs8(carry_max, carry_max);
+ carry_max = vpmax_qs8(carry_max, carry_max);
+ carry_max = vpmax_qs8(carry_max, carry_max);
+
+ *(reinterpret_cast<qint8_t *>(output.ptr())) = vget_lane_s8(carry_max, 0);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+void logits_1d_max_qs16(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window in_slice = window.first_slice_window_1D();
+
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window max_slice = window_max.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator output(out, max_slice);
+
+ qint16x8_t vec_max = vdupq_n_qs16(std::numeric_limits<qint16_t>::lowest());
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint16_t *>(input.ptr());
+ const qint16x8_t current_value = vld1q_qs16(in_ptr);
+ vec_max = vmaxq_qs16(vec_max, current_value);
+ },
+ input);
+
+ qint16x4_t carry_max = vpmax_qs16(vget_high_qs16(vec_max), vget_low_qs16(vec_max));
+ carry_max = vpmax_qs16(carry_max, carry_max);
+ carry_max = vpmax_qs16(carry_max, carry_max);
+
+ *(reinterpret_cast<qint16_t *>(output.ptr())) = vget_lane_s16(carry_max, 0);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+void logits_1d_max_f16(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window in_slice = window.first_slice_window_1D();
+
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window max_slice = window_max.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator output(out, max_slice);
+
+ float16x8_t vec_max = vdupq_n_f16(std::numeric_limits<float16_t>::lowest());
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const float16x8_t current_value = vld1q_f16(in_ptr);
+ vec_max = vmaxq_f16(vec_max, current_value);
+ },
+ input);
+
+ float16x4_t carry_max = vpmax_f16(vget_high_f16(vec_max), vget_low_f16(vec_max));
+ carry_max = vpmax_f16(carry_max, carry_max);
+ carry_max = vpmax_f16(carry_max, carry_max);
+
+ *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(carry_max, 0);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
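Each new max kernel finishes with the same horizontal reduction: pairwise maxima are folded until a single lane holds the row maximum. The pattern, sketched here for plain uint8 lanes (the kernels above apply it to qs8, qs16 and f16 respectively):

#include <arm_neon.h>
#include <cstdint>

inline uint8_t horizontal_max_u8(uint8x16_t v)
{
    // 16 lanes -> 8 candidates
    uint8x8_t m = vpmax_u8(vget_high_u8(v), vget_low_u8(v));
    // Each pairwise fold halves the number of distinct candidates: 8 -> 4 -> 2 -> 1
    m = vpmax_u8(m, m);
    m = vpmax_u8(m, m);
    m = vpmax_u8(m, m);
    return vget_lane_u8(m, 0);
}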
void logits_1d_max_f32(const ITensor *in, ITensor *out, const Window &window)
{
Window in_slice = window.first_slice_window_1D();
@@ -73,39 +170,6 @@
}
while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
}
-
-void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window)
-{
- Window in_slice = window.first_slice_window_1D();
-
- Window window_max(window);
- window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window max_slice = window_max.first_slice_window_1D();
-
- do
- {
- Iterator input(in, in_slice);
- Iterator output(out, max_slice);
-
- qint8x16_t vec_max = vdupq_n_s8(-1);
-
- execute_window_loop(in_slice, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
- const qint8x16_t current_value = vld1q_qs8(in_ptr);
- vec_max = vmaxq_qs8(vec_max, current_value);
- },
- input);
-
- qint8x8_t carry_max = vpmax_qs8(vget_high_s8(vec_max), vget_low_s8(vec_max));
- carry_max = vpmax_qs8(carry_max, carry_max);
- carry_max = vpmax_qs8(carry_max, carry_max);
- carry_max = vpmax_qs8(carry_max, carry_max);
-
- *(reinterpret_cast<int8_t *>(output.ptr())) = vget_lane_s8(carry_max, 0);
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
-}
} // namespace
NELogits1DMaxKernel::NELogits1DMaxKernel()
@@ -120,30 +184,46 @@
void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ // Softmax across the x dimension
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(0, 1);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
const int input_width = input->info()->valid_region().shape.x();
- unsigned int num_elems_processed_per_iteration = 0;
+ unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
switch(input->info()->data_type())
{
case DataType::QS8:
- _func = &logits_1d_max_qs8;
- num_elems_processed_per_iteration = 16;
+ _func = &logits_1d_max_qs8;
+ break;
+ case DataType::QS16:
+ _func = &logits_1d_max_qs16;
break;
case DataType::F32:
- num_elems_processed_per_iteration = 4;
- _func = &logits_1d_max_f32;
+ _func = &logits_1d_max_f32;
break;
+ case DataType::F16:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ _func = &logits_1d_max_f16;
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
ARM_COMPUTE_ERROR("Unsupported data type.");
}
_input = input;
_output = output;
- _border_size = BorderSize(0, input_width % num_elems_processed_per_iteration, 0, 0);
+ _border_size = BorderSize(0, num_elems_processed_per_iteration - (input_width % num_elems_processed_per_iteration), 0, 0);
// Configure kernel window
constexpr unsigned int num_elems_written_per_row = 1;
@@ -159,8 +239,9 @@
INEKernel::configure(win);
}
-void NELogits1DMaxKernel::run(const Window &window)
+void NELogits1DMaxKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
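The configure change above replaces the per-type constants with 16 / element_size, i.e. the number of lanes a 128-bit NEON register holds, and pads the row on the right so the final vector load stays in bounds. A sketch of that arithmetic (illustrative helper, not library API):

#include <cstddef>

// element_size: 1 for QS8, 2 for QS16/F16, 4 for F32.
inline size_t right_border_for_row(size_t input_width, size_t element_size)
{
    const size_t elems_per_iteration = 16 / element_size;
    // Mirrors the expression used in configure(); note it yields a full extra
    // vector when input_width is already a multiple of the lane count.
    return elems_per_iteration - (input_width % elems_per_iteration);
}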
@@ -170,67 +251,6 @@
namespace
{
-void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
-{
- Window window_max(window);
- window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
-
- Window max_slice = window_max.first_slice_window_1D();
- Window in_slice = window.first_slice_window_1D();
-
- constexpr int step = 4;
- const int long_steps = in->info()->valid_region().shape.x() / step;
- const int small_steps = in->info()->valid_region().shape.x() % step;
-
- do
- {
- Iterator input(in, in_slice);
- Iterator exp(out, in_slice);
- Iterator _max(max, max_slice);
- Iterator _sum(sum, max_slice);
-
- // Get pointers
- auto in_ptr = reinterpret_cast<const float *>(input.ptr());
- auto exp_ptr = reinterpret_cast<float *>(exp.ptr());
-
- // Init sum to zero
- float32x4_t vec_sum_value = vdupq_n_f32(0.0f);
-
- // Get max value
- const auto max_ptr = reinterpret_cast<const float *>(_max.ptr());
- const float32x4_t vec_max = vdupq_n_f32(*max_ptr);
-
- // Run neon loop
- for(int i = 0; i < long_steps; ++i)
- {
- float32x4_t vec_elements = vld1q_f32(in_ptr);
- vec_elements = vsubq_f32(vec_elements, vec_max);
- vec_elements = vexpq_f32(vec_elements);
-
- vst1q_f32(exp_ptr, vec_elements);
- vec_sum_value = vaddq_f32(vec_elements, vec_sum_value);
-
- in_ptr += step;
- exp_ptr += step;
- }
-
- // Reduce sum
- float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
- carry_addition = vpadd_f32(carry_addition, carry_addition);
- float sum = vget_lane_f32(carry_addition, 0);
-
- // Run remaining elements
- for(int i = 0; i < small_steps; ++i)
- {
- float element = std::exp(in_ptr[i] - *max_ptr);
- exp_ptr[i] = element;
- sum += element;
- }
-
- *(reinterpret_cast<float *>(_sum.ptr())) = sum;
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
-}
void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
{
Window window_max(window);
@@ -293,6 +313,190 @@
}
while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
}
+void logits_1d_shift_exp_sum_qs16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+{
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window max_slice = window_max.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ constexpr int step = 4;
+ const int long_steps = in->info()->valid_region().shape.x() / step;
+ const int small_steps = in->info()->valid_region().shape.x() % step;
+ const int fixed_point_position = in->info()->fixed_point_position();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator exp(out, in_slice);
+ Iterator _max(max, max_slice);
+ Iterator _sum(sum, max_slice);
+
+ // Get pointers
+ auto in_ptr = reinterpret_cast<const qint16_t *>(input.ptr());
+ auto exp_ptr = reinterpret_cast<qint16_t *>(exp.ptr());
+
+ // Init sum to zero
+ qint32x4_t vec_sum_value = vdupq_n_qs32(0);
+
+ // Get max value
+ const auto max_ptr = reinterpret_cast<const qint16_t *>(_max.ptr());
+ const qint16x4_t vec_max = vdup_n_qs16(*max_ptr);
+
+ // Run neon loop
+ for(int i = 0; i < long_steps; ++i)
+ {
+ qint16x4_t vec_elements = vld1_qs16(in_ptr);
+ vec_elements = vqsub_qs16(vec_elements, vec_max);
+ vec_elements = vqexp_qs16(vec_elements, fixed_point_position);
+
+ vst1_qs16(exp_ptr, vec_elements);
+ vec_sum_value = vqaddq_qs32(vec_sum_value, vmovl_s16(vec_elements));
+
+ in_ptr += step;
+ exp_ptr += step;
+ }
+ // Reduce sum
+ qint32x2_t carry_addition = vqadd_qs32(vget_high_s32(vec_sum_value), vget_low_s32(vec_sum_value));
+ qint32_t sum = vget_lane_s32(carry_addition, 0) + vget_lane_s32(carry_addition, 1);
+
+ // Run remaining elements
+ for(int i = 0; i < small_steps; ++i)
+ {
+ qint16_t element = sqexp_qs16(sqsub_qs16(in_ptr[i], *max_ptr), fixed_point_position);
+ exp_ptr[i] = element;
+ sum = sqadd_qs32(sum, element);
+ }
+
+ *(reinterpret_cast<qint16_t *>(_sum.ptr())) = sqmovn_qs32(sum);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+void logits_1d_shift_exp_sum_f16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+{
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window max_slice = window_max.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ constexpr int step = 8;
+ const int long_steps = in->info()->valid_region().shape.x() / step;
+ const int small_steps = in->info()->valid_region().shape.x() % step;
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator exp(out, in_slice);
+ Iterator _max(max, max_slice);
+ Iterator _sum(sum, max_slice);
+
+ // Get pointers
+ auto in_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ auto exp_ptr = reinterpret_cast<float16_t *>(exp.ptr());
+
+ // Init sum to zero
+ float16x8_t vec_sum_value = vdupq_n_f16(0);
+
+ // Get max value
+ const auto max_ptr = reinterpret_cast<const float16_t *>(_max.ptr());
+ const float16x8_t vec_max = vdupq_n_f16(*max_ptr);
+
+ // Run neon loop
+ for(int i = 0; i < long_steps; ++i)
+ {
+ float16x8_t vec_elements = vld1q_f16(in_ptr);
+ vec_elements = vsubq_f16(vec_elements, vec_max);
+ vec_elements = vexpq_f16(vec_elements);
+
+ vst1q_f16(exp_ptr, vec_elements);
+ vec_sum_value = vaddq_f16(vec_sum_value, vec_elements);
+
+ in_ptr += step;
+ exp_ptr += step;
+ }
+ // Reduce sum
+ const float16x4_t sum_red = vadd_f16(vget_low_f16(vec_sum_value), vget_high_f16(vec_sum_value));
+ const float16x4_t carry_addition = vpadd_f16(sum_red, sum_red);
+ float16_t sum = vget_lane_f16(carry_addition, 0) + vget_lane_f16(carry_addition, 1);
+
+ // Run remaining elements
+ for(int i = 0; i < small_steps; ++i)
+ {
+ const float16_t element = std::exp(static_cast<float>(in_ptr[i] - *max_ptr));
+ exp_ptr[i] = element;
+ sum += element;
+ }
+ *(reinterpret_cast<float16_t *>(_sum.ptr())) = sum;
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+{
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window max_slice = window_max.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ constexpr int step = 4;
+ const int long_steps = in->info()->valid_region().shape.x() / step;
+ const int small_steps = in->info()->valid_region().shape.x() % step;
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator exp(out, in_slice);
+ Iterator _max(max, max_slice);
+ Iterator _sum(sum, max_slice);
+
+ // Get pointers
+ auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ auto exp_ptr = reinterpret_cast<float *>(exp.ptr());
+
+ // Init sum to zero
+ float32x4_t vec_sum_value = vdupq_n_f32(0.0f);
+
+ // Get max value
+ const auto max_ptr = reinterpret_cast<const float *>(_max.ptr());
+ const float32x4_t vec_max = vdupq_n_f32(*max_ptr);
+
+ // Run neon loop
+ for(int i = 0; i < long_steps; ++i)
+ {
+ float32x4_t vec_elements = vld1q_f32(in_ptr);
+ vec_elements = vsubq_f32(vec_elements, vec_max);
+ vec_elements = vexpq_f32(vec_elements);
+
+ vst1q_f32(exp_ptr, vec_elements);
+ vec_sum_value = vaddq_f32(vec_elements, vec_sum_value);
+
+ in_ptr += step;
+ exp_ptr += step;
+ }
+
+ // Reduce sum
+ float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
+ carry_addition = vpadd_f32(carry_addition, carry_addition);
+ float sum = vget_lane_f32(carry_addition, 0);
+
+ // Run remaining elements
+ for(int i = 0; i < small_steps; ++i)
+ {
+ float element = std::exp(in_ptr[i] - *max_ptr);
+ exp_ptr[i] = element;
+ sum += element;
+ }
+
+ *(reinterpret_cast<float *>(_sum.ptr())) = sum;
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
} //namespace
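All of the shift-exp-sum variants implement the same row-wise step of softmax: subtract the row maximum before exponentiating (for numerical stability), store the exponentials and accumulate their sum. The scalar equivalent, sketched for float:

#include <cmath>
#include <cstddef>

inline float shift_exp_sum_row(const float *in, float *out, size_t width, float row_max)
{
    float sum = 0.0f;
    for(size_t x = 0; x < width; ++x)
    {
        const float e = std::exp(in[x] - row_max); // shift keeps exp() well conditioned
        out[x] = e;
        sum += e;
    }
    return sum; // consumed by the norm kernel below
}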
NELogits1DShiftExpSumKernel::NELogits1DShiftExpSumKernel()
@@ -302,11 +506,16 @@
void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, max, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, max, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output, max, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
unsigned int num_elems_processed_per_iteration = input->info()->valid_region().shape.x();
@@ -316,11 +525,20 @@
case DataType::QS8:
_func = &logits_1d_shift_exp_sum_qs8;
break;
+ case DataType::QS16:
+ _func = &logits_1d_shift_exp_sum_qs16;
+ break;
case DataType::F32:
_func = &logits_1d_shift_exp_sum_f32;
break;
+ case DataType::F16:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ _func = &logits_1d_shift_exp_sum_f16;
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
ARM_COMPUTE_ERROR("Unsupported data type.");
+ break;
}
_input = input;
@@ -343,8 +561,9 @@
INEKernel::configure(win);
}
-void NELogits1DShiftExpSumKernel::run(const Window &window)
+void NELogits1DShiftExpSumKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -354,36 +573,6 @@
namespace
{
-void logits_1d_norm_f32(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
-{
- Window window_sum(window);
- window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window sum_slice = window_sum.first_slice_window_1D();
- Window in_slice = window.first_slice_window_1D();
-
- do
- {
- Iterator input(in, in_slice);
- Iterator _sum(sum, sum_slice);
- Iterator output(out, in_slice);
-
- const float sum_value = *reinterpret_cast<const float *>(_sum.ptr());
- const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value);
-
- execute_window_loop(in_slice, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto out_ptr = reinterpret_cast<float *>(output.ptr());
-
- const float32x4_t vec_in = vld1q_f32(in_ptr);
- const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed);
-
- vst1q_f32(out_ptr, normalized_value);
- },
- input, output);
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
-}
void logits_1d_norm_qs8(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
{
Window window_sum(window);
@@ -416,6 +605,101 @@
}
while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
}
+void logits_1d_norm_qs16(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
+{
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window sum_slice = window_sum.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ const int fixed_point_position = in->info()->fixed_point_position();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator _sum(sum, sum_slice);
+ Iterator output(out, in_slice);
+
+ const int16_t sum_value = *reinterpret_cast<const qint16_t *>(_sum.ptr());
+ const qint16x8_t vec_sum_inversed = vqrecipq_qs16(vdupq_n_qs16(sum_value), fixed_point_position);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint16_t *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<qint16_t *>(output.ptr());
+
+ const qint16x8_t vec_in = vld1q_qs16(in_ptr);
+ const qint16x8_t normalized_value = vqmulq_qs16(vec_in, vec_sum_inversed, fixed_point_position);
+
+ vst1q_qs16(out_ptr, normalized_value);
+ },
+ input, output);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+void logits_1d_norm_f16(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
+{
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window sum_slice = window_sum.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator _sum(sum, sum_slice);
+ Iterator output(out, in_slice);
+
+ const float16_t sum_value = *reinterpret_cast<const float16_t *>(_sum.ptr());
+ const float16x8_t vec_sum_inversed = vdupq_n_f16(1.0f / sum_value);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+ const float16x8_t vec_in = vld1q_f16(in_ptr);
+ const float16x8_t normalized_value = vmulq_f16(vec_in, vec_sum_inversed);
+
+ vst1q_f16(out_ptr, normalized_value);
+ },
+ input, output);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+void logits_1d_norm_f32(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
+{
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window sum_slice = window_sum.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator _sum(sum, sum_slice);
+ Iterator output(out, in_slice);
+
+ const float sum_value = *reinterpret_cast<const float *>(_sum.ptr());
+ const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(output.ptr());
+
+ const float32x4_t vec_in = vld1q_f32(in_ptr);
+ const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed);
+
+ vst1q_f32(out_ptr, normalized_value);
+ },
+ input, output);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
} // namespace
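The norm kernels complete softmax by scaling every stored exponential with the reciprocal of the row sum, so softmax(x) = exp(x - max) / sum(exp(x - max)). A scalar sketch of the float path:

#include <cstddef>

inline void norm_row(const float *exp_in, float *out, size_t width, float sum)
{
    const float inv_sum = 1.0f / sum; // invert once, multiply per element
    for(size_t x = 0; x < width; ++x)
    {
        out[x] = exp_in[x] * inv_sum;
    }
}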
NELogits1DNormKernel::NELogits1DNormKernel()
@@ -425,9 +709,14 @@
void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, sum);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(sum, output);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
_input = input;
@@ -435,20 +724,27 @@
_output = output;
// Configure kernel window
- unsigned int num_elems_processed_per_iteration = 0;
+ unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
switch(input->info()->data_type())
{
case DataType::QS8:
- _func = &logits_1d_norm_qs8;
- num_elems_processed_per_iteration = 16;
+ _func = &logits_1d_norm_qs8;
+ break;
+ case DataType::QS16:
+ _func = &logits_1d_norm_qs16;
break;
case DataType::F32:
- num_elems_processed_per_iteration = 4;
- _func = &logits_1d_norm_f32;
+ _func = &logits_1d_norm_f32;
break;
+ case DataType::F16:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ _func = &logits_1d_norm_f16;
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
ARM_COMPUTE_ERROR("Unsupported data type.");
+ break;
}
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
@@ -464,8 +760,9 @@
INEKernel::configure(win);
}
-void NELogits1DNormKernel::run(const Window &window)
+void NELogits1DNormKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NETableLookupKernel.cpp b/src/core/NEON/kernels/NETableLookupKernel.cpp
index f0b58d8..958f4a9 100644
--- a/src/core/NEON/kernels/NETableLookupKernel.cpp
+++ b/src/core/NEON/kernels/NETableLookupKernel.cpp
@@ -133,8 +133,9 @@
INESimpleKernel::configure(input, output, num_num_elems_processed_per_iteration);
}
-void NETableLookupKernel::run(const Window &window)
+void NETableLookupKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEThresholdKernel.cpp b/src/core/NEON/kernels/NEThresholdKernel.cpp
index 7203119..5ef0693 100644
--- a/src/core/NEON/kernels/NEThresholdKernel.cpp
+++ b/src/core/NEON/kernels/NEThresholdKernel.cpp
@@ -119,8 +119,9 @@
input, output);
}
-void NEThresholdKernel::run(const Window &window)
+void NEThresholdKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index 492de8a..1cfaafe 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -179,8 +179,9 @@
void NETransposeKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
TensorShape output_shape{ input->info()->tensor_shape() };
const size_t w_out = input->info()->dimension(1);
@@ -191,8 +192,9 @@
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
@@ -231,8 +233,9 @@
INEKernel::configure(win);
}
-void NETransposeKernel::run(const Window &window)
+void NETransposeKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
index 6c90a33..62f4e5d 100644
--- a/src/core/NEON/kernels/NEWarpKernel.cpp
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -49,8 +49,14 @@
{
}
-void INEWarpKernel::run(const Window &window)
+BorderSize INEWarpKernel::border_size() const
{
+ return BorderSize(1);
+}
+
+void INEWarpKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -93,9 +99,9 @@
// Reads can occur within the valid region of the input
AccessWindowStatic input_access(input->info(),
- input_valid_region.anchor[0], input_valid_region.anchor[1],
- input_valid_region.anchor[0] + input_valid_region.shape[0],
- input_valid_region.anchor[1] + input_valid_region.shape[1]);
+ input_valid_region.anchor[0] - border_size().left, input_valid_region.anchor[1] - border_size().top,
+ input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size().right,
+ input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom);
AccessWindowHorizontal output_access(output->info(), 0, 1);
update_window_and_padding(win, input_access, output_access);
@@ -171,7 +177,7 @@
*out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -250,7 +256,7 @@
*out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -330,7 +336,7 @@
*out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -390,7 +396,11 @@
const float start_z0 = M20 * window.x().start();
// Current row
- int y_cur = window.y().start();
+ int y_cur = window.y().start();
+ int z_cur = window.z().start();
+ int d3_cur = window[3].start();
+ int d4_cur = window[4].start();
+ int d5_cur = window[5].start();
// const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing
float const_x0 = M01 * y_cur + M02;
@@ -405,9 +415,13 @@
execute_window_loop(window, [&](const Coordinates & id)
{
// Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
- if(y_cur != id.y())
+ if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
{
- y_cur = id.y();
+ y_cur = id.y();
+ z_cur = id.z();
+ d3_cur = id[3];
+ d4_cur = id[4];
+ d5_cur = id[5];
const_x0 = M01 * y_cur + M02;
const_y0 = M11 * y_cur + M12;
@@ -431,7 +445,7 @@
*out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -484,7 +498,11 @@
const float start_z0 = M20 * window.x().start();
// Current row
- int y_cur = window.y().start();
+ int y_cur = window.y().start();
+ int z_cur = window.z().start();
+ int d3_cur = window[3].start();
+ int d4_cur = window[4].start();
+ int d5_cur = window[5].start();
// const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing
float const_x0 = M01 * y_cur + M02;
@@ -498,10 +516,14 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- // Check if we are processing a new row. If so, update the current row (y_cur), x0, y0 and z0
- if(y_cur != id.y())
+ // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
+ if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
{
- y_cur = id.y();
+ y_cur = id.y();
+ z_cur = id.z();
+ d3_cur = id[3];
+ d4_cur = id[4];
+ d5_cur = id[5];
const_x0 = M01 * y_cur + M02;
const_y0 = M11 * y_cur + M12;
@@ -516,7 +538,6 @@
const float yn = y0 / z0;
// Only use input values if xn and yn are within the valid region.
- // Otherwise write the constant border value.
if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x))
{
switch(interpolation)
@@ -525,7 +546,7 @@
*out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -533,7 +554,34 @@
}
else
{
- *out.ptr() = _constant_border_value;
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = _constant_border_value;
+ break;
+ case InterpolationPolicy::BILINEAR:
+ {
+ const auto xi = clamp<int>(std::floor(xn), min_x - 1, max_x);
+ const auto yi = clamp<int>(std::floor(yn), min_y - 1, max_y);
+ const auto xi_1 = clamp<int>(std::floor(xn + 1), min_x - 1, max_x);
+ const auto yi_1 = clamp<int>(std::floor(yn + 1), min_y - 1, max_y);
+
+ const float dx = xn - std::floor(xn);
+ const float dy = yn - std::floor(yn);
+ const float dx1 = 1.0f - dx;
+ const float dy1 = 1.0f - dy;
+
+ const float a00 = *(in.ptr() + xi + yi * stride);
+ const float a01 = *(in.ptr() + xi_1 + yi * stride);
+ const float a10 = *(in.ptr() + xi + yi_1 * stride);
+ const float a11 = *(in.ptr() + xi_1 + yi_1 * stride);
+
+ *out.ptr() = a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
}
x0 += M00;
@@ -562,7 +610,11 @@
const size_t stride = _input->info()->strides_in_bytes()[1];
// Current row
- int y_cur = window.y().start();
+ int y_cur = window.y().start();
+ int z_cur = window.z().start();
+ int d3_cur = window[3].start();
+ int d4_cur = window[4].start();
+ int d5_cur = window[5].start();
// x0 = M00 * x + M01 * y + M02
// y0 = M10 * x + M11 * y + M12
@@ -596,10 +648,14 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- // Check if we are processing a new row. If so, update the current row (y_cur), x0, y0 and z0
- if(y_cur != id.y())
+ // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
+ if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
{
- y_cur = id.y();
+ y_cur = id.y();
+ z_cur = id.z();
+ d3_cur = id[3];
+ d4_cur = id[4];
+ d5_cur = id[5];
const_x0 = M01 * y_cur + M02;
const_y0 = M11 * y_cur + M12;
@@ -614,7 +670,6 @@
const float yn = y0 / z0;
// Only load from (x0, y0) if the point is within the valid region.
- // Otherwise load from the edge of the valid region.
if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x))
{
switch(interpolation)
@@ -623,7 +678,7 @@
*out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -632,10 +687,34 @@
else
{
// Clamp coordinates
- const auto xi = clamp<int>(x0, min_x, max_x - 1);
- const auto yi = clamp<int>(y0, min_y, max_y - 1);
+ const auto xi = clamp<int>(std::floor(xn), min_x, max_x - 1);
+ const auto yi = clamp<int>(std::floor(yn), min_y, max_y - 1);
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = *(in.ptr() + xi + yi * stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ {
+ const auto xi_1 = clamp<int>(std::floor(xn + 1), min_x, max_x - 1);
+ const auto yi_1 = clamp<int>(std::floor(yn + 1), min_y, max_y - 1);
- *out.ptr() = *(in.ptr() + xi + yi * stride);
+ const float dx = xn - std::floor(xn);
+ const float dy = yn - std::floor(yn);
+ const float dx1 = 1.0f - dx;
+ const float dy1 = 1.0f - dy;
+
+ const float a00 = *(in.ptr() + xi + yi * stride);
+ const float a01 = *(in.ptr() + xi_1 + yi * stride);
+ const float a10 = *(in.ptr() + xi + yi_1 * stride);
+ const float a11 = *(in.ptr() + xi_1 + yi_1 * stride);
+
+ *out.ptr() = a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
}
x0 += M00;
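The new out-of-bounds branches in NEWarpKernel no longer write the constant border value (or the nearest edge pixel) unconditionally; for BILINEAR they clamp the four neighbour coordinates into the valid region and interpolate from there. A standalone sketch of that sampling, assuming a single-channel uint8 image with stride bytes per row:

#include <algorithm>
#include <cmath>
#include <cstdint>

inline uint8_t clamped_bilinear(const uint8_t *src, int stride, float xn, float yn,
                                int min_x, int max_x, int min_y, int max_y)
{
    const int xi   = std::clamp(static_cast<int>(std::floor(xn)), min_x, max_x - 1);
    const int yi   = std::clamp(static_cast<int>(std::floor(yn)), min_y, max_y - 1);
    const int xi_1 = std::clamp(static_cast<int>(std::floor(xn + 1.0f)), min_x, max_x - 1);
    const int yi_1 = std::clamp(static_cast<int>(std::floor(yn + 1.0f)), min_y, max_y - 1);

    const float dx  = xn - std::floor(xn);
    const float dy  = yn - std::floor(yn);
    const float dx1 = 1.0f - dx;
    const float dy1 = 1.0f - dy;

    const float a00 = src[xi + yi * stride];
    const float a01 = src[xi_1 + yi * stride];
    const float a10 = src[xi + yi_1 * stride];
    const float a11 = src[xi_1 + yi_1 * stride];

    return static_cast<uint8_t>(a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy));
}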
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index aa6be44..d52e88c 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -37,7 +37,8 @@
template <typename T>
void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
{
- const unsigned int kernel_size = input->info()->dimension(0);
+ const unsigned int kernel_size_x = input->info()->dimension(0);
+ const unsigned int kernel_size_y = input->info()->dimension(1);
const unsigned int kernel_depth = input->info()->dimension(2);
const unsigned int input_stride_x = input->info()->strides_in_bytes().x();
const unsigned int input_stride_y = input->info()->strides_in_bytes().y();
@@ -61,9 +62,9 @@
// Linearize volume
for(unsigned int d = 0; d < kernel_depth; ++d)
{
- for(unsigned int j = 0; j < kernel_size; ++j)
+ for(unsigned int j = 0; j < kernel_size_y; ++j)
{
- for(unsigned int i = 0; i < kernel_size; ++i)
+ for(unsigned int i = 0; i < kernel_size_x; ++i)
{
*(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
tmp_input_ptr += input_stride_x;
@@ -94,62 +95,60 @@
void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != input->info()->dimension(1));
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- TensorShape output_shape{ input->info()->tensor_shape() };
+ const int fixed_point_position = input->info()->fixed_point_position();
+ const DataType dt = input->info()->data_type();
+ const TensorShape &input_shape = input->info()->tensor_shape();
+ TensorShape output_shape{ input_shape };
output_shape.collapse(3);
+
const size_t tmp_dim = output_shape[0];
output_shape.set(0, output_shape[1]);
output_shape.set(1, tmp_dim + (bias != nullptr ? 1 : 0));
- // Set data type and shape for output tensor if not yet configured
- set_data_type_if_unknown(*output->info(), dt);
- set_fixed_point_position_if_zero(*output->info(), fixed_point_position);
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, dt, fixed_point_position);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
if(bias != nullptr)
{
- TensorShape bias_shape{ input->info()->tensor_shape()[3] };
-
- // Set data type and shape for bias tensor if not yet configured
- set_data_type_if_unknown(*bias->info(), dt);
- set_fixed_point_position_if_zero(*bias->info(), fixed_point_position);
- set_shape_if_empty(*bias->info(), bias_shape);
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(bias->info()->tensor_shape(), bias_shape);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F32, DataType::QS8);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (bias->info()->num_dimensions() != 1));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (bias->info()->num_dimensions() != 2));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (bias->info()->dimension(0) != input->info()->tensor_shape()[3]));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (bias->info()->dimension(0) != input->info()->tensor_shape()[3] || bias->info()->dimension(1) != input->info()->tensor_shape()[4]));
}
_input = input;
_bias = bias;
_output = output;
- switch(_input->info()->data_type())
+ switch(_input->info()->element_size())
{
- case DataType::F32:
+ case 4:
{
_func = &weights_reshape<uint32_t>;
break;
}
- case DataType::QS8:
+ case 2:
+ {
+ _func = &weights_reshape<uint16_t>;
+ break;
+ }
+ case 1:
{
_func = &weights_reshape<uint8_t>;
break;
}
default:
{
- ARM_COMPUTE_ERROR_ON("Data type not supported");
+ ARM_COMPUTE_ERROR("Element size not supported");
break;
}
}
@@ -166,8 +165,9 @@
INEKernel::configure(window);
}
-void NEWeightsReshapeKernel::run(const Window &window)
+void NEWeightsReshapeKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
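For each output feature map, the reshape above walks kernel_x * kernel_y * depth input elements and writes them out as one contiguous row, appending the bias value when one is supplied. A plain-array sketch of that linearisation (the ITensor iterators and byte strides are replaced with simple indices here):

#include <cstddef>

inline void reshape_one_ofm(const float *weights, const float *bias, float *row,
                            size_t kx, size_t ky, size_t depth)
{
    size_t n = 0;
    for(size_t d = 0; d < depth; ++d)
    {
        for(size_t j = 0; j < ky; ++j)
        {
            for(size_t i = 0; i < kx; ++i)
            {
                row[n++] = weights[(d * ky + j) * kx + i]; // weights laid out as [depth][ky][kx]
            }
        }
    }
    if(bias != nullptr)
    {
        row[n] = *bias; // the extra element added to the output shape when bias is present
    }
}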
diff --git a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
new file mode 100644
index 0000000..ad0743b
--- /dev/null
+++ b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
+} // namespace arm_compute
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace arm_compute
+{
+void NEGEMMAArch32Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _transform_0 = transform_0;
+ _transform_1 = transform_1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info());
+
+ AccessWindowRectangle output_access(output->info(), 0, 0, 8, 6);
+
+ const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 6);
+ const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 8);
+
+ update_window_and_padding(win,
+ AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
+ AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
+ output_access);
+
+ INEKernel::configure(win);
+}
+
+void NEGEMMAArch32Kernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
+ const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
+ const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
+
+ const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
+
+ const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
+ const int N = _output->info()->tensor_shape().x();
+ const int K = _input0->info()->tensor_shape().x();
+
+ // Only iterate over batches
+ Window win(window);
+ win.set(0, Window::Dimension(0, 1, 1));
+ win.set(1, Window::Dimension(0, 1, 1));
+
+ Iterator in0(_input0, window);
+ Iterator out(_output, window);
+
+ GemmInterleaved<sgemm_8x6, float, float> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1);
+ constexpr size_t alignment = 4096;
+ const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+ void *workspace = _workspace->buffer() + offset;
+ size_t workspace_size = _workspace->info()->total_size();
+
+ if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
+ {
+ ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ gemm.execute(reinterpret_cast<const float *>(in0.ptr()), lda,
+ reinterpret_cast<const float *>(in1_ptr), ldb,
+ reinterpret_cast<float *>(out.ptr()), ldc,
+ _alpha, _beta, workspace);
+ },
+ in0, out);
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
new file mode 100644
index 0000000..d70524b
--- /dev/null
+++ b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace arm_compute
+{
+void NEGEMMAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _transform_0 = transform_0;
+ _transform_1 = transform_1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info());
+
+ AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
+
+ const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
+ const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
+
+ update_window_and_padding(win,
+ AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
+ AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
+ output_access);
+
+ INEKernel::configure(win);
+}
+
+void NEGEMMAArch64Kernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
+ const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
+ const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
+
+ const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
+
+ const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
+ const int N = _output->info()->tensor_shape().x();
+ const int K = _input0->info()->tensor_shape().x();
+
+ // Only iterate over batches
+ Window win(window);
+ win.set(0, Window::Dimension(0, 1, 1));
+ win.set(1, Window::Dimension(0, 1, 1));
+
+ Iterator in0(_input0, window);
+ Iterator out(_output, window);
+
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1);
+ constexpr size_t alignment = 4096;
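+ // Per-thread workspace slice with (alignment - 1) bytes of slack for the std::align call below, mirroring the AArch32 kernel.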
+ const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+ void *workspace = _workspace->buffer() + offset;
+ size_t workspace_size = _workspace->info()->total_size();
+
+ if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
+ {
+ ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ gemm.execute(reinterpret_cast<const float *>(in0.ptr()), lda,
+ reinterpret_cast<const float *>(in1_ptr), ldb,
+ reinterpret_cast<float *>(out.ptr()), ldc,
+ _alpha, _beta, workspace);
+ },
+ in0, out);
+}
+} // namespace arm_compute
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index 3d07ccb..91a3531 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -244,13 +244,13 @@
{
if(_tensor_shape.total_size() > 0)
{
- required_strides = Strides(stride_x);
+ required_strides = Strides(stride_x, stride_x);
required_total_size = stride_z;
}
break;
}
case 1:
- required_strides = compute_strides(*this, stride_x);
+ required_strides = compute_strides(*this, stride_x, stride_y);
required_total_size = stride_z;
break;
case 2:
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index bf005c1..99d3956 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -156,6 +156,8 @@
{ ActivationLayerInfo::ActivationFunction::LOGISTIC, "LOGISTIC" },
{ ActivationLayerInfo::ActivationFunction::RELU, "RELU" },
{ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, "BRELU" },
+ { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU" },
+ { ActivationLayerInfo::ActivationFunction::LEAKY_RELU, "LRELU" },
{ ActivationLayerInfo::ActivationFunction::SOFT_RELU, "SRELU" },
{ ActivationLayerInfo::ActivationFunction::SQRT, "SQRT" },
{ ActivationLayerInfo::ActivationFunction::SQUARE, "SQUARE" },
@@ -226,6 +228,18 @@
return norm_type_map[type];
}
+const std::string &arm_compute::string_from_pooling_type(PoolingType type)
+{
+ static std::map<PoolingType, const std::string> pool_type_map =
+ {
+ { PoolingType::MAX, "MAX" },
+ { PoolingType::AVG, "AVG" },
+ { PoolingType::L2, "L2" },
+ };
+
+ return pool_type_map[type];
+}
+
std::string arm_compute::lower_string(const std::string &val)
{
std::string res = val;
@@ -233,22 +247,25 @@
return res;
}
-const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height, unsigned int kernel_size,
- unsigned int stride_x, unsigned int stride_y,
- unsigned int pad_x, unsigned int pad_y,
- DimensionRoundingType round_type)
+const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height,
+ unsigned int kernel_width, unsigned int kernel_height,
+ const PadStrideInfo &pad_stride_info)
{
- unsigned int w = 0;
- unsigned int h = 0;
- switch(round_type)
+ const unsigned int pad_x = pad_stride_info.pad().first;
+ const unsigned int pad_y = pad_stride_info.pad().second;
+ const unsigned int stride_x = pad_stride_info.stride().first;
+ const unsigned int stride_y = pad_stride_info.stride().second;
+ unsigned int w = 0;
+ unsigned int h = 0;
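+ // out = (in + 2 * pad - kernel) / stride + 1 per spatial dimension, with the
+ // division rounded down for FLOOR and up for CEIL.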
+ switch(pad_stride_info.round())
{
case DimensionRoundingType::FLOOR:
- w = static_cast<unsigned int>(std::floor((static_cast<float>(width + 2 * pad_x - kernel_size) / stride_x) + 1));
- h = static_cast<unsigned int>(std::floor((static_cast<float>(height + 2 * pad_y - kernel_size) / stride_y) + 1));
+ w = static_cast<unsigned int>(std::floor((static_cast<float>(width + 2 * pad_x - kernel_width) / stride_x) + 1));
+ h = static_cast<unsigned int>(std::floor((static_cast<float>(height + 2 * pad_y - kernel_height) / stride_y) + 1));
break;
case DimensionRoundingType::CEIL:
- w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + 2 * pad_x - kernel_size) / stride_x) + 1));
- h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + 2 * pad_y - kernel_size) / stride_y) + 1));
+ w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + 2 * pad_x - kernel_width) / stride_x) + 1));
+ h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + 2 * pad_y - kernel_height) / stride_y) + 1));
break;
default:
ARM_COMPUTE_ERROR("Unsupported rounding type");
@@ -283,6 +300,7 @@
case DataType::U16:
print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width, element_delim);
break;
+ case DataType::QS16:
case DataType::S16:
print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width, element_delim);
break;
@@ -313,6 +331,7 @@
return max_consecutive_elements_display_width_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n);
case DataType::U16:
return max_consecutive_elements_display_width_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n);
+ case DataType::QS16:
case DataType::S16:
return max_consecutive_elements_display_width_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n);
case DataType::U32:
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
index ae2841d..084a325 100644
--- a/src/core/Validate.cpp
+++ b/src/core/Validate.cpp
@@ -60,6 +60,22 @@
}
}
+void arm_compute::error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line,
+ const arm_compute::Window &full, const arm_compute::Window &window, const int dim)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(dim);
+
+ full.validate();
+ window.validate();
+
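+ // A dimension can only be collapsed if the sub-window spans the full window along it, starting at 0.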
+ ARM_COMPUTE_ERROR_ON_LOC(window[dim].start() != 0, function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(window[dim].start() != full[dim].start(), function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(full[dim].end() != window[dim].end(), function, file, line);
+}
+
void arm_compute::error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
const arm_compute::Coordinates &pos, unsigned int max_dim)
{
@@ -168,7 +184,7 @@
ARM_COMPUTE_UNUSED(kernel);
ARM_COMPUTE_ERROR_ON_LOC(kernel == nullptr, function, file, line);
- ARM_COMPUTE_ERROR_ON_LOC_MSG((kernel->window().x().start() == kernel->window().x().end()) && (kernel->window().x().end() == 0),
+ ARM_COMPUTE_ERROR_ON_LOC_MSG((kernel->window().x().start() == kernel->window().x().end()) && (kernel->window().x().end() == 0) && (kernel->window().x().step() == 0),
function, file, line,
"This kernel hasn't been configured.");
}
diff --git a/src/graph/CL/CLMap.cpp b/src/graph/CL/CLMap.cpp
new file mode 100644
index 0000000..4892b96
--- /dev/null
+++ b/src/graph/CL/CLMap.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/CL/CLMap.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+using namespace arm_compute::graph;
+
+CLMap::CLMap(Tensor *tensor, bool blocking)
+ : _tensor(dynamic_cast<arm_compute::CLTensor *>(tensor->tensor())), _blocking(blocking)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
+}
+
+void CLMap::run()
+{
+ _tensor->map(_blocking);
+}
diff --git a/src/graph/CL/CLUnmap.cpp b/src/graph/CL/CLUnmap.cpp
new file mode 100644
index 0000000..ec7d865
--- /dev/null
+++ b/src/graph/CL/CLUnmap.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/CL/CLUnmap.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+using namespace arm_compute::graph;
+
+CLUnmap::CLUnmap(Tensor *tensor)
+ : _tensor(dynamic_cast<arm_compute::CLTensor *>(tensor->tensor()))
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
+}
+
+void CLUnmap::run()
+{
+ _tensor->unmap();
+}
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
new file mode 100644
index 0000000..525506f
--- /dev/null
+++ b/src/graph/Graph.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/Graph.h"
+
+#include "arm_compute/graph/CL/CLMap.h"
+#include "arm_compute/graph/CL/CLUnmap.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+
+using namespace arm_compute::graph;
+
+struct Stage
+{
+ Tensor *_input;
+ Tensor *_output;
+ std::unique_ptr<arm_compute::IFunction> _function;
+};
+
+struct Graph::Private
+{
+public:
+ /** Finalizes the current node's configuration
+ *
+ * @param _next_hint Device execution hint
+ */
+ void configure(Hint _next_hint);
+
+ /** Sets whether to enable information print out
+ *
+ * @param[in] is_enabled Set to true to print information about each node as it is configured
+ */
+ void set_info_enablement(bool is_enabled);
+
+ std::vector<Stage> _pipeline{};
+ std::vector<std::unique_ptr<Tensor>> _tensors{};
+ std::vector<std::unique_ptr<INode>> _nodes{};
+ Hint _current_hint{ Hint::DONT_CARE };
+ Hint _next_hint{ Hint::DONT_CARE };
+ std::unique_ptr<Tensor> _graph_input{ nullptr };
+ std::unique_ptr<Tensor> _graph_output{ nullptr };
+ std::unique_ptr<INode> _current_node{ nullptr };
+ Tensor *_current_output{ nullptr };
+ bool _info_enabled{ false };
+
+private:
+ Tensor *_current_input{ nullptr };
+ Hint _previous_hint{ Hint::DONT_CARE };
+};
+
+Graph::~Graph() //NOLINT
+{
+ //Can't use =default because the destructor must be defined after Graph::Private's definition
+}
+
+Graph::Graph()
+ : _pimpl{ new Private() }
+{
+}
+
+void Graph::run()
+{
+ while(true)
+ {
+ if(!_pimpl->_graph_input->call_accessor())
+ {
+ return;
+ }
+
+ for(auto &stage : _pimpl->_pipeline)
+ {
+ stage._function->run();
+ }
+
+ if(!_pimpl->_graph_output->call_accessor())
+ {
+ return;
+ }
+ }
+}
+
+//Finalize current node's configuration
+void Graph::Private::configure(Hint _next_hint)
+{
+ ARM_COMPUTE_ERROR_ON(_current_node == nullptr);
+ ARM_COMPUTE_ERROR_ON(_graph_input == nullptr);
+
+ // Is it the first node of the graph?
+ if(_current_input == nullptr)
+ {
+ _graph_input->set_target(_current_hint);
+ _current_input = _graph_input.get();
+ _previous_hint = _current_hint; // For the first node just assume the previous node was of the same type as this one
+ }
+
+ // Automatic output configuration?
+ if(_current_output == nullptr)
+ {
+ _tensors.push_back(arm_compute::support::cpp14::make_unique<Tensor>(TensorInfo()));
+ _current_output = _tensors.back().get();
+ }
+
+ // If either the writer or reader node needs OpenCL then use OpenCL memory:
+ if((_next_hint == Hint::OPENCL || _current_hint == Hint::OPENCL))
+ {
+ _current_output->set_target(Hint::OPENCL);
+ }
+ else
+ {
+ _current_output->set_target(Hint::NEON);
+ }
+
+ // Map input if needed
+ std::unique_ptr<arm_compute::IFunction> func = _current_node->instantiate_node(_current_hint, _current_input->tensor(), _current_output->tensor());
+ _current_input->allocate();
+
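+ // The input lives in OpenCL memory: if the previous node ran on NEON it wrote
+ // through a mapped pointer, so unmap before handing the buffer to OpenCL; if
+ // the current node runs on NEON, map the buffer (blocking) so it can read it.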
+ if(_current_input->target() == Hint::OPENCL)
+ {
+ if(_previous_hint == Hint::NEON)
+ {
+ ARM_COMPUTE_ERROR_ON(_current_hint == Hint::NEON);
+ _pipeline.push_back({ _current_input, _current_input, arm_compute::support::cpp14::make_unique<CLUnmap>(_current_input) });
+ }
+ if(_current_hint == Hint::NEON)
+ {
+ ARM_COMPUTE_ERROR_ON(_previous_hint == Hint::NEON);
+ _pipeline.push_back({ _current_input, _current_input, arm_compute::support::cpp14::make_unique<CLMap>(_current_input, true) });
+ }
+ }
+
+ _pipeline.push_back({ _current_input, _current_output, std::move(func) });
+
+ _current_input = _current_output;
+ _current_output = nullptr;
+ _previous_hint = _current_hint;
+ _current_hint = _next_hint;
+}
+
+void Graph::Private::set_info_enablement(bool is_enabled)
+{
+ _info_enabled = is_enabled;
+}
+
+void Graph::add_node(std::unique_ptr<INode> node)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_pimpl->_graph_input == nullptr, "The graph's input must be set before the first node is added");
+ ARM_COMPUTE_ERROR_ON_MSG(_pimpl->_graph_output != nullptr, "Nothing can be added after the output tensor");
+ //Trigger the creation of the current Node:
+
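+ // Nodes are configured lazily: the previously added node is finalized here,
+ // now that the hint of the node consuming its output is known.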
+ Hint _next_hint = node->override_hint(_pimpl->_next_hint);
+ ARM_COMPUTE_ERROR_ON(_next_hint == Hint::DONT_CARE);
+ if(_pimpl->_current_node)
+ {
+ //Finalize the previous Node:
+ _pimpl->configure(_pimpl->_next_hint);
+
+ if(_pimpl->_info_enabled)
+ {
+ _pimpl->_current_node->print_info();
+ }
+ }
+ else
+ {
+ // If that's the first node then use the same Hint before and after the node.
+ _pimpl->_current_hint = _next_hint;
+ }
+ if(_pimpl->_current_node)
+ {
+ _pimpl->_nodes.push_back(std::move(_pimpl->_current_node));
+ }
+ _pimpl->_current_node = std::move(node);
+}
+
+void Graph::set_hint(Hint hint)
+{
+ _pimpl->_next_hint = hint;
+}
+
+void Graph::set_info_enablement(bool is_enabled)
+{
+ _pimpl->set_info_enablement(is_enabled);
+}
+
+// Add a tensor with an Accessor (i.e. either the input or output of the graph)
+void Graph::add_tensor(std::unique_ptr<Tensor> tensor)
+{
+ // If it's the first Tensor added then it will be the input of the Graph.
+ if(_pimpl->_graph_input == nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON(_pimpl->_graph_output != nullptr);
+ ARM_COMPUTE_ERROR_ON(_pimpl->_current_node != nullptr);
+ _pimpl->_graph_input = std::move(tensor);
+ }
+ else
+ {
+ // Else it will be the output of the Graph
+ ARM_COMPUTE_ERROR_ON(_pimpl->_graph_output != nullptr);
+ ARM_COMPUTE_ERROR_ON(_pimpl->_current_node == nullptr);
+ _pimpl->_graph_output = std::move(tensor);
+ _pimpl->_current_output = _pimpl->_graph_output.get();
+
+ // Finalize the graph by configuring the last Node of the graph:
+ _pimpl->configure(_pimpl->_current_hint); // Ignore _next_hint as this is the last node, and just use the same hint as before this node.
+ _pimpl->_graph_output->allocate();
+ }
+}
+
+void Graph::set_temp(TensorInfo &&tmp)
+{
+ ARM_COMPUTE_ERROR_ON(_pimpl->_graph_input == nullptr);
+ ARM_COMPUTE_ERROR_ON(_pimpl->_graph_output != nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(_pimpl->_current_output != nullptr, "TensorInfo for temporary tensor already set");
+
+ _pimpl->_tensors.push_back(arm_compute::support::cpp14::make_unique<Tensor>(std::move(tmp)));
+ _pimpl->_current_output = _pimpl->_tensors.back().get();
+}
+
+Graph &arm_compute::graph::operator<<(Graph &graph, TensorInfo &&info)
+{
+ graph.set_temp(std::move(info));
+ return graph;
+}
+
+Graph &arm_compute::graph::operator<<(Graph &graph, Tensor &&tensor)
+{
+ graph.add_tensor(arm_compute::support::cpp14::make_unique<Tensor>(std::move(tensor)));
+ return graph;
+}
+
+Graph &arm_compute::graph::operator<<(Graph &graph, Hint hint)
+{
+ graph.set_hint(hint);
+ return graph;
+}
diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
new file mode 100644
index 0000000..6b25022
--- /dev/null
+++ b/src/graph/INode.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/graph/INode.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Validate.h"
+
+#include <ostream>
+
+using namespace arm_compute::graph;
+
+Hint INode::override_hint(Hint hint) const
+{
+ if(hint == Hint::OPENCL && !opencl_is_available())
+ {
+ hint = Hint::DONT_CARE;
+ }
+ hint = node_override_hint(hint);
+ ARM_COMPUTE_ERROR_ON(hint == Hint::OPENCL && !opencl_is_available());
+ return hint;
+}
+
+Hint INode::node_override_hint(Hint hint) const
+{
+ return hint == Hint::DONT_CARE ? Hint::NEON : hint;
+}
diff --git a/src/graph/Tensor.cpp b/src/graph/Tensor.cpp
new file mode 100644
index 0000000..c534ae0
--- /dev/null
+++ b/src/graph/Tensor.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/graph/Tensor.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename TensorType>
+std::unique_ptr<ITensor> initialise_tensor(TensorInfo &info)
+{
+ auto tensor = arm_compute::support::cpp14::make_unique<TensorType>();
+ tensor->allocator()->init(info);
+ return std::move(tensor);
+}
+
+template <typename TensorType>
+void tensor_allocate(ITensor &tensor)
+{
+ auto itensor = dynamic_cast<TensorType *>(&tensor);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(itensor);
+ itensor->allocator()->allocate();
+}
+} // namespace
+
+Tensor::Tensor(TensorInfo &&info)
+ : _target(Hint::DONT_CARE), _info(info), _accessor(nullptr), _tensor(nullptr)
+{
+}
+
+Tensor::Tensor(Tensor &&src) noexcept
+ : _target(src._target),
+ _info(std::move(src._info)),
+ _accessor(std::move(src._accessor)),
+ _tensor(std::move(src._tensor))
+{
+}
+
+void Tensor::set_info(TensorInfo &&info)
+{
+ _info = info;
+}
+
+bool Tensor::call_accessor()
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_accessor.get());
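+ // OpenCL tensors have to be mapped to host memory before the accessor can
+ // touch their buffer; map only if no mapping exists yet, and the tensor is unmapped again afterwards.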
+ auto cl_tensor = dynamic_cast<arm_compute::CLTensor *>(_tensor.get());
+ if(cl_tensor != nullptr && cl_tensor->buffer() == nullptr)
+ {
+ cl_tensor->map();
+ }
+ bool retval = _accessor->access_tensor(*_tensor);
+ if(cl_tensor != nullptr)
+ {
+ cl_tensor->unmap();
+ }
+ return retval;
+}
+
+ITensor *Tensor::tensor()
+{
+ return _tensor.get();
+}
+
+const TensorInfo &Tensor::info() const
+{
+ return _info;
+}
+
+ITensor *Tensor::set_target(Hint target)
+{
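+ // The backing ITensor (CLTensor or Tensor) is created lazily on the first
+ // call; once created, the target can no longer change.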
+ if(_tensor != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON(target != _target);
+ }
+ else
+ {
+ switch(target)
+ {
+ case Hint::OPENCL:
+ _tensor = initialise_tensor<arm_compute::CLTensor>(_info);
+ break;
+ case Hint::NEON:
+ _tensor = initialise_tensor<arm_compute::Tensor>(_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Invalid Hint");
+ }
+ _target = target;
+ }
+ return _tensor.get();
+}
+
+void Tensor::allocate()
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor.get());
+ switch(_target)
+ {
+ case Hint::OPENCL:
+ tensor_allocate<arm_compute::CLTensor>(*_tensor);
+ break;
+ case Hint::NEON:
+ tensor_allocate<arm_compute::Tensor>(*_tensor);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Invalid Hint");
+ }
+}
+
+void Tensor::allocate_and_fill_if_needed()
+{
+ allocate();
+ if(_accessor != nullptr)
+ {
+ call_accessor();
+ }
+}
+
+Hint Tensor::target() const
+{
+ return _target;
+}
diff --git a/src/graph/nodes/ActivationLayer.cpp b/src/graph/nodes/ActivationLayer.cpp
new file mode 100644
index 0000000..b71e22c
--- /dev/null
+++ b/src/graph/nodes/ActivationLayer.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ActivationLayer.h"
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
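+// Backend dispatch: the Hint template parameter picks the matching function and
+// tensor types (CLActivationLayer with CLTensor for OPENCL, NEActivationLayer
+// with Tensor for NEON) through the explicit specialisations below.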
+template <typename ActivationType, typename TensorType, Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
+{
+ auto activation = arm_compute::support::cpp14::make_unique<ActivationType>();
+ activation->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output),
+ activation_info);
+
+ return std::move(activation);
+}
+
+template <Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
+{
+ return instantiate_function<arm_compute::CLActivationLayer, arm_compute::CLTensor, Hint::OPENCL>(input, output, activation_info);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
+{
+ return instantiate_function<arm_compute::NEActivationLayer, arm_compute::Tensor, Hint::NEON>(input, output, activation_info);
+}
+} // namespace
+
+ActivationLayer::ActivationLayer(const ActivationLayerInfo activation_info)
+ : _activation_info(activation_info)
+{
+}
+
+std::unique_ptr<arm_compute::IFunction> ActivationLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+{
+ std::unique_ptr<arm_compute::IFunction> func;
+ _hint = hint;
+ _input = input;
+ _output = output;
+
+ if(_hint == Hint::OPENCL)
+ {
+ func = instantiate<Hint::OPENCL>(input, output, _activation_info);
+ }
+ else
+ {
+ func = instantiate<Hint::NEON>(input, output, _activation_info);
+ }
+ return func;
+}
+
+void ActivationLayer::print_info()
+{
+ if(_hint == Hint::OPENCL)
+ {
+ std::cout << "Instantiating CLActivationLayer";
+ }
+ else
+ {
+ std::cout << "Instantiating NEActivationLayer";
+ }
+
+ std::cout << " Data Type: " << _input->info()->data_type()
+ << " Input shape: " << _input->info()->tensor_shape()
+ << " Output shape: " << _output->info()->tensor_shape()
+ << " Activation function: " << _activation_info.activation()
+ << " a: " << _activation_info.a()
+ << " b: " << _activation_info.b()
+ << std::endl;
+}
diff --git a/src/graph/nodes/ConvolutionLayer.cpp b/src/graph/nodes/ConvolutionLayer.cpp
new file mode 100644
index 0000000..b80bf93
--- /dev/null
+++ b/src/graph/nodes/ConvolutionLayer.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ConvolutionLayer.h"
+
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename ConvolutionType, typename TensorType, Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+ bool weights_are_loaded = weights.tensor() != nullptr;
+ bool biases_are_loaded = biases.tensor() != nullptr;
+
+ auto conv = arm_compute::support::cpp14::make_unique<ConvolutionType>();
+ conv->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(weights.set_target(hint)),
+ dynamic_cast<TensorType *>(biases.set_target(hint)),
+ dynamic_cast<TensorType *>(output),
+ conv_info, weights_info);
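+ // Weights/biases that were not preloaded are allocated and filled through
+ // their accessor only after configure(), so their tensor info (including any
+ // padding added during configuration) is final before allocation.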
+ if(!weights_are_loaded)
+ {
+ weights.allocate_and_fill_if_needed();
+ }
+ if(!biases_are_loaded)
+ {
+ biases.allocate_and_fill_if_needed();
+ }
+
+ return std::move(conv);
+}
+
+template <Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+ return instantiate_function<arm_compute::CLConvolutionLayer, arm_compute::CLTensor, Hint::OPENCL>(input, weights, biases, output, conv_info, weights_info);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+ return instantiate_function<arm_compute::NEConvolutionLayer, arm_compute::Tensor, Hint::NEON>(input, weights, biases, output, conv_info, weights_info);
+}
+} // namespace
+
+std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+{
+ if(_weights.tensor() == nullptr)
+ {
+ _weights.set_info(TensorInfo(TensorShape(_conv_width, _conv_height, input->info()->dimension(2), _ofm), input->info()->num_channels(), input->info()->data_type(),
+ input->info()->fixed_point_position()));
+ }
+ if(_biases.tensor() == nullptr)
+ {
+ _biases.set_info(TensorInfo(TensorShape(_ofm), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+ }
+
+ std::unique_ptr<arm_compute::IFunction> func;
+ _hint = hint;
+ _input = input;
+ _output = output;
+
+ if(_hint == Hint::OPENCL)
+ {
+ func = instantiate<Hint::OPENCL>(input, _weights, _biases, output, _conv_info, _weights_info);
+ }
+ else
+ {
+ func = instantiate<Hint::NEON>(input, _weights, _biases, output, _conv_info, _weights_info);
+ }
+
+ return func;
+}
+
+void ConvolutionLayer::print_info()
+{
+ if(_hint == Hint::OPENCL)
+ {
+ std::cout << "Instantiating CLConvolutionLayer";
+ }
+ else
+ {
+ std::cout << "Instantiating NEConvolutionLayer";
+ }
+ std::cout << " Type: " << _input->info()->data_type() << " Input Shape: " << _input->info()->tensor_shape() << " Weights shape: " << _weights.info().tensor_shape() << " Biases Shape: " <<
+ _biases.info().tensor_shape() << " Output Shape: " << _output->info()->tensor_shape() << " PadStrideInfo: " << _conv_info << " WeightsInfo: " << _weights_info << std::endl;
+}
diff --git a/src/graph/nodes/FullyConnectedLayer.cpp b/src/graph/nodes/FullyConnectedLayer.cpp
new file mode 100644
index 0000000..8d244cb
--- /dev/null
+++ b/src/graph/nodes/FullyConnectedLayer.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/FullyConnectedLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename FullyConnectedType, typename TensorType, Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
+{
+ bool weights_are_loaded = weights.tensor() != nullptr;
+ bool biases_are_loaded = biases.tensor() != nullptr;
+
+ auto conv = arm_compute::support::cpp14::make_unique<FullyConnectedType>();
+ conv->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(weights.set_target(hint)),
+ dynamic_cast<TensorType *>(biases.set_target(hint)),
+ dynamic_cast<TensorType *>(output));
+ if(!weights_are_loaded)
+ {
+ weights.allocate_and_fill_if_needed();
+ }
+ if(!biases_are_loaded)
+ {
+ biases.allocate_and_fill_if_needed();
+ }
+
+ return std::move(conv);
+}
+
+template <Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
+{
+ return instantiate_function<arm_compute::CLFullyConnectedLayer, arm_compute::CLTensor, Hint::OPENCL>(input, weights, biases, output);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
+{
+ return instantiate_function<arm_compute::NEFullyConnectedLayer, arm_compute::Tensor, Hint::NEON>(input, weights, biases, output);
+}
+} // namespace
+
+std::unique_ptr<arm_compute::IFunction> FullyConnectedLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+{
+ if(_weights.tensor() == nullptr)
+ {
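+ // No weights were supplied: flatten every non-batch input dimension into
+ // num_weights and create a (num_weights x _num_neurons) weight matrix.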
+ unsigned int num_weights = 1;
+ unsigned int num_dimensions = input->info()->num_dimensions();
+ // Ignore the batch dimension if there is one:
+ if(num_dimensions == 2 || num_dimensions == 4)
+ {
+ num_dimensions--;
+ }
+ for(unsigned int i = 0; i < num_dimensions; i++)
+ {
+ num_weights *= input->info()->dimension(i);
+ }
+ _weights.set_info(TensorInfo(TensorShape(num_weights, _num_neurons), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+ }
+ if(_biases.tensor() == nullptr)
+ {
+ _biases.set_info(TensorInfo(TensorShape(_num_neurons), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+ }
+
+ arm_compute::auto_init_if_empty(*output->info(), TensorShape(_num_neurons, input->info()->dimension(1)), input->info()->num_channels(), input->info()->data_type(),
+ input->info()->fixed_point_position());
+
+ std::unique_ptr<arm_compute::IFunction> func;
+ _hint = hint;
+ _input = input;
+ _output = output;
+
+ if(_hint == Hint::OPENCL)
+ {
+ func = instantiate<Hint::OPENCL>(input, _weights, _biases, output);
+ }
+ else
+ {
+ func = instantiate<Hint::NEON>(input, _weights, _biases, output);
+ }
+
+ return func;
+}
+
+void FullyConnectedLayer::print_info()
+{
+ if(_hint == Hint::OPENCL)
+ {
+ std::cout << "Instantiating CLFullyConnectedLayer";
+ }
+ else
+ {
+ std::cout << "Instantiating NEFullyConnectedLayer";
+ }
+ std::cout << " Type: " << _input->info()->data_type() << " Input Shape: " << _input->info()->tensor_shape() << " Weights shape: " << _weights.info().tensor_shape() << " Biases Shape: " <<
+ _biases.info().tensor_shape() << " Output Shape: " << _output->info()->tensor_shape() << std::endl;
+}
diff --git a/src/graph/nodes/PoolingLayer.cpp b/src/graph/nodes/PoolingLayer.cpp
new file mode 100644
index 0000000..f29332f
--- /dev/null
+++ b/src/graph/nodes/PoolingLayer.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/PoolingLayer.h"
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename PoolingType, typename TensorType, Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+{
+ auto pool = arm_compute::support::cpp14::make_unique<PoolingType>();
+ pool->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output),
+ pool_info);
+
+ return std::move(pool);
+}
+
+template <Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+{
+ return instantiate_function<arm_compute::CLPoolingLayer, arm_compute::CLTensor, Hint::OPENCL>(input, output, pool_info);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+{
+ return instantiate_function<arm_compute::NEPoolingLayer, arm_compute::Tensor, Hint::NEON>(input, output, pool_info);
+}
+} // namespace
+
+PoolingLayer::PoolingLayer(const PoolingLayerInfo pool_info)
+ : _pool_info(pool_info)
+{
+}
+
+std::unique_ptr<arm_compute::IFunction> PoolingLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+{
+ std::unique_ptr<arm_compute::IFunction> func;
+ _hint = hint;
+ _input = input;
+ _output = output;
+
+ if(_hint == Hint::OPENCL)
+ {
+ func = instantiate<Hint::OPENCL>(input, output, _pool_info);
+ }
+ else
+ {
+ func = instantiate<Hint::NEON>(input, output, _pool_info);
+ }
+
+ return func;
+}
+
+void PoolingLayer::print_info()
+{
+ if(_hint == Hint::OPENCL)
+ {
+ std::cout << "Instantiating CLPoolingLayer";
+ }
+ else
+ {
+ std::cout << "Instantiating NEPoolingLayer";
+ }
+
+ std::cout << " Data Type: " << _input->info()->data_type()
+ << " Input shape: " << _input->info()->tensor_shape()
+ << " Output shape: " << _output->info()->tensor_shape()
+ << " Pooling info: " << _pool_info << std::endl;
+}
diff --git a/src/graph/nodes/SoftmaxLayer.cpp b/src/graph/nodes/SoftmaxLayer.cpp
new file mode 100644
index 0000000..fee8897
--- /dev/null
+++ b/src/graph/nodes/SoftmaxLayer.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/SoftmaxLayer.h"
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename SoftmaxType, typename TensorType, Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output)
+{
+ auto softmax = arm_compute::support::cpp14::make_unique<SoftmaxType>();
+ softmax->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output));
+
+ return std::move(softmax);
+}
+
+template <Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, ITensor *output)
+{
+ return instantiate_function<arm_compute::CLSoftmaxLayer, arm_compute::CLTensor, Hint::OPENCL>(input, output);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, ITensor *output)
+{
+ return instantiate_function<arm_compute::NESoftmaxLayer, arm_compute::Tensor, Hint::NEON>(input, output);
+}
+} // namespace
+
+std::unique_ptr<arm_compute::IFunction> SoftmaxLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+{
+ std::unique_ptr<arm_compute::IFunction> func;
+ _hint = hint;
+ _input = input;
+ _output = output;
+
+ if(_hint == Hint::OPENCL)
+ {
+ func = instantiate<Hint::OPENCL>(input, output);
+ }
+ else
+ {
+ func = instantiate<Hint::NEON>(input, output);
+ }
+
+ return func;
+}
+
+void SoftmaxLayer::print_info()
+{
+ if(_hint == Hint::OPENCL)
+ {
+ std::cout << "Instantiating CLSoftmaxLayer";
+ }
+ else
+ {
+ std::cout << "Instantiating NESoftmaxLayer";
+ }
+ std::cout << " Data Type: " << _input->info()->data_type()
+ << " Input shape: " << _input->info()->tensor_shape()
+ << " Output shape: " << _output->info()->tensor_shape()
+ << std::endl;
+}
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
new file mode 100644
index 0000000..50b0f0e
--- /dev/null
+++ b/src/runtime/Allocator.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Allocator.h"
+
+#include "arm_compute/core/Error.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+void *Allocator::allocate(size_t size, size_t alignment)
+{
+ ARM_COMPUTE_UNUSED(alignment);
+ return ::operator new(size);
+}
+
+void Allocator::free(void *ptr)
+{
+ ::operator delete(ptr);
+}
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
new file mode 100644
index 0000000..69292b9
--- /dev/null
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/BlobLifetimeManager.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/BlobMemoryPool.h"
+#include "arm_compute/runtime/IAllocator.h"
+#include "arm_compute/runtime/IMemoryGroup.h"
+#include "support/ToolchainSupport.h"
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <vector>
+
+using namespace arm_compute;
+
+BlobLifetimeManager::BlobLifetimeManager()
+ : _active_group(nullptr), _active_elements(), _finalized_groups(), _blobs()
+{
+}
+
+void BlobLifetimeManager::register_group(IMemoryGroup *group)
+{
+ if(_active_group == nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON(group == nullptr);
+ _active_group = group;
+ }
+}
+
+void BlobLifetimeManager::start_lifetime(void *obj)
+{
+ ARM_COMPUTE_ERROR_ON(obj == nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(std::find_if(std::begin(_active_elements), std::end(_active_elements), [&obj](const Element & e)
+ {
+ return obj == e.id;
+ }) != std::end(_active_elements),
+ "Memory object is already registered!");
+
+ // Register the object as active; its finalized status starts out false
+ _active_elements.emplace_back(obj);
+}
+
+void BlobLifetimeManager::end_lifetime(void *obj, void **handle, size_t size)
+{
+ ARM_COMPUTE_ERROR_ON(obj == nullptr);
+
+ // Find object
+ auto it = std::find_if(std::begin(_active_elements), std::end(_active_elements), [&obj](const Element & e)
+ {
+ return obj == e.id;
+ });
+ ARM_COMPUTE_ERROR_ON(it == std::end(_active_elements));
+
+ // Update object fields and mark object as complete
+ it->handle = handle;
+ it->size = size;
+ it->status = true;
+
+ // Check if all objects are finalized and, if so, reset the active group
+ if(are_all_finalized())
+ {
+ // Update finalized groups
+ _finalized_groups[_active_group].insert(std::end(_finalized_groups[_active_group]), std::begin(_active_elements), std::end(_active_elements));
+
+ // Update blobs and group mappings
+ update_blobs_and_mappings();
+
+ // Reset state
+ _active_elements.clear();
+ _active_group = nullptr;
+ }
+}
+
+std::unique_ptr<IMemoryPool> BlobLifetimeManager::create_pool(IAllocator *allocator)
+{
+ ARM_COMPUTE_ERROR_ON(allocator == nullptr);
+ return support::cpp14::make_unique<BlobMemoryPool>(allocator, _blobs);
+}
+
+bool BlobLifetimeManager::are_all_finalized() const
+{
+ return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const Element e)
+ {
+ return !e.status;
+ });
+}
+
+MappingType BlobLifetimeManager::mapping_type() const
+{
+ return MappingType::BLOBS;
+}
+
+void BlobLifetimeManager::update_blobs_and_mappings()
+{
+ ARM_COMPUTE_ERROR_ON(!are_all_finalized());
+ ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
+
+ // Sort active group requirements in descending order
+ std::sort(std::begin(_active_elements), std::end(_active_elements), [](const Element & a, const Element & b)
+ {
+ return a.size > b.size;
+ });
+ std::vector<size_t> group_sizes;
+ std::transform(std::begin(_active_elements), std::end(_active_elements), std::back_inserter(group_sizes), [](const Element & e)
+ {
+ return e.size;
+ });
+
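+ // Blobs are shared across memory groups: grow each blob to the element-wise
+ // maximum of the sorted per-group requirements so it can satisfy the largest
+ // request mapped to that index.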
+ // Update blob sizes
+ size_t max_size = std::max(_blobs.size(), group_sizes.size());
+ _blobs.resize(max_size, 0);
+ group_sizes.resize(max_size, 0);
+ std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](size_t lhs, size_t rhs)
+ {
+ return std::max(lhs, rhs);
+ });
+
+ // Calculate group mappings
+ auto &group_mappings = _active_group->mappings();
+ int blob_idx = 0;
+ for(auto &e : _active_elements)
+ {
+ group_mappings[e.handle] = blob_idx++;
+ }
+}
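
The blob-merging arithmetic above is worth seeing in isolation: each group's size requirements are sorted in descending order and folded element-wise into a running maximum, so one shared set of blobs can back every finalized group. Below is a minimal standalone sketch of that step using only the standard library and hypothetical sizes; it mirrors update_blobs_and_mappings but involves no arm_compute types.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Merge one group's size requirements into the shared blob sizes, mirroring the
// sort + element-wise max performed by BlobLifetimeManager::update_blobs_and_mappings.
void merge_group(std::vector<std::size_t> &blobs, std::vector<std::size_t> group)
{
    std::sort(group.begin(), group.end(), std::greater<std::size_t>());
    const std::size_t max_size = std::max(blobs.size(), group.size());
    blobs.resize(max_size, 0);
    group.resize(max_size, 0);
    std::transform(blobs.begin(), blobs.end(), group.begin(), blobs.begin(),
                   [](std::size_t lhs, std::size_t rhs) { return std::max(lhs, rhs); });
}

int main()
{
    std::vector<std::size_t> blobs;
    merge_group(blobs, { 4096, 1024 });     // first group needs two buffers (hypothetical sizes)
    merge_group(blobs, { 512, 2048, 512 }); // second group needs three
    for(std::size_t s : blobs)
    {
        std::cout << s << " "; // prints "4096 1024 512": three blobs cover both groups
    }
    std::cout << std::endl;
    return 0;
}
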
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
new file mode 100644
index 0000000..29505e5
--- /dev/null
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/BlobMemoryPool.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IAllocator.h"
+#include "arm_compute/runtime/IMemoryPool.h"
+#include "arm_compute/runtime/Types.h"
+#include "support/ToolchainSupport.h"
+
+#include <vector>
+
+using namespace arm_compute;
+
+BlobMemoryPool::BlobMemoryPool(IAllocator *allocator, std::vector<size_t> blob_sizes)
+ : _allocator(allocator), _blobs(), _blob_sizes(std::move(blob_sizes))
+{
+ ARM_COMPUTE_ERROR_ON(!allocator);
+ allocate_blobs(_blob_sizes);
+}
+
+BlobMemoryPool::~BlobMemoryPool()
+{
+ ARM_COMPUTE_ERROR_ON(!_allocator);
+ free_blobs();
+}
+
+void BlobMemoryPool::acquire(MemoryMappings &handles)
+{
+ // Bind pool memory to the registered handles
+ for(auto &handle : handles)
+ {
+ ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
+ *handle.first = _blobs[handle.second];
+ }
+}
+
+void BlobMemoryPool::release(MemoryMappings &handles)
+{
+ for(auto &handle : handles)
+ {
+ ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
+ *handle.first = nullptr;
+ }
+}
+
+MappingType BlobMemoryPool::mapping_type() const
+{
+ return MappingType::BLOBS;
+}
+
+std::unique_ptr<IMemoryPool> BlobMemoryPool::duplicate()
+{
+ ARM_COMPUTE_ERROR_ON(!_allocator);
+ return support::cpp14::make_unique<BlobMemoryPool>(_allocator, _blob_sizes);
+}
+
+void BlobMemoryPool::allocate_blobs(const std::vector<size_t> &sizes)
+{
+ ARM_COMPUTE_ERROR_ON(!_allocator);
+
+ for(const auto &size : sizes)
+ {
+ _blobs.push_back(_allocator->allocate(size, 0));
+ }
+}
+
+void BlobMemoryPool::free_blobs()
+{
+ ARM_COMPUTE_ERROR_ON(!_allocator);
+
+ for(auto &blob : _blobs)
+ {
+ _allocator->free(blob);
+ }
+ _blobs.clear();
+}
\ No newline at end of file
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
new file mode 100644
index 0000000..9a5c13a
--- /dev/null
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLBufferAllocator.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+CLBufferAllocator::CLBufferAllocator(cl::Context context)
+ : _context(std::move(context))
+{
+}
+
+void *CLBufferAllocator::allocate(size_t size, size_t alignment)
+{
+ ARM_COMPUTE_UNUSED(alignment);
+ cl_mem buf = clCreateBuffer(_context.get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr);
+ return static_cast<void *>(buf);
+}
+
+void CLBufferAllocator::free(void *ptr)
+{
+ ARM_COMPUTE_ERROR_ON(ptr == nullptr);
+ clReleaseMemObject(static_cast<cl_mem>(ptr));
+}
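
CLBufferAllocator returns the raw cl_mem wrapped in a void*, so the result is an opaque device handle rather than host-accessible memory. A minimal usage sketch follows, assuming default_init() is available to set up the OpenCL context and queue (as the CLScheduler error message later in this patch suggests) and using an arbitrary 1 MiB size.

#include "arm_compute/runtime/CL/CLBufferAllocator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

int main()
{
    // Initialise the default OpenCL context/queue (assumed available on the platform)
    arm_compute::CLScheduler::get().default_init();

    // Allocate a 1 MiB device buffer; the return value wraps a cl_mem handle
    arm_compute::CLBufferAllocator allocator(arm_compute::CLScheduler::get().context());
    void *buffer = allocator.allocate(1024 * 1024, 0);

    // ... hand the handle to a memory pool or other consumer ...

    // Release the cl_mem once the buffer is no longer needed
    allocator.free(buffer);
    return 0;
}
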
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
index b9e8739..88d45ac 100644
--- a/src/runtime/CL/CLMultiHOG.cpp
+++ b/src/runtime/CL/CLMultiHOG.cpp
@@ -25,12 +25,12 @@
#include "arm_compute/core/CL/ICLHOG.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
CLMultiHOG::CLMultiHOG(size_t num_models)
- : _num_models(num_models), _model(arm_compute::cpp14::make_unique<CLHOG[]>(_num_models))
+ : _num_models(num_models), _model(arm_compute::support::cpp14::make_unique<CLHOG[]>(_num_models))
{
}
diff --git a/src/runtime/CL/CLPyramid.cpp b/src/runtime/CL/CLPyramid.cpp
index 41d81ea..865f389 100644
--- a/src/runtime/CL/CLPyramid.cpp
+++ b/src/runtime/CL/CLPyramid.cpp
@@ -24,10 +24,10 @@
#include "arm_compute/runtime/CL/CLPyramid.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PyramidInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
+#include "support/ToolchainSupport.h"
#include <array>
#include <cmath>
@@ -52,7 +52,7 @@
void CLPyramid::internal_init(const PyramidInfo &info, bool auto_padding)
{
_info = info;
- _pyramid = arm_compute::cpp14::make_unique<CLTensor[]>(_info.num_levels());
+ _pyramid = arm_compute::support::cpp14::make_unique<CLTensor[]>(_info.num_levels());
size_t w = _info.width();
size_t h = _info.height();
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index fe25ce5..71a749f 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -24,11 +24,12 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLTuner.h"
using namespace arm_compute;
CLScheduler::CLScheduler()
- : _context(), _queue(), _target(GPUTarget::MIDGARD)
+ : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
{
}
@@ -40,10 +41,22 @@
void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
{
+ ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
+ "The CLScheduler is not initialised yet! Please call CLScheduler::get().default_init(), \
+ or CLScheduler::get().init() and CLKernelLibrary::get().init() before running functions!");
+
+ // Tune the kernel if the CLTuner has been provided
+ if(_cl_tuner != nullptr)
+ {
+ // Tune the OpenCL kernel
+ _cl_tuner->tune_kernel(kernel);
+ }
+
+ // Run kernel
kernel.run(kernel.window(), _queue);
if(flush)
{
_queue.flush();
}
-}
+}
\ No newline at end of file
diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp
index eefa033..bc513d1 100644
--- a/src/runtime/CL/CLTensor.cpp
+++ b/src/runtime/CL/CLTensor.cpp
@@ -28,7 +28,7 @@
using namespace arm_compute;
CLTensor::CLTensor()
- : _allocator()
+ : _allocator(this)
{
}
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index 8112a71..ad165fa 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -25,15 +25,21 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
-CLTensorAllocator::CLTensorAllocator()
- : _buffer(), _mapping(nullptr)
+CLTensorAllocator::CLTensorAllocator(CLTensor *owner)
+ : _associated_memory_group(nullptr), _buffer(), _mapping(nullptr), _owner(owner)
{
}
+CLTensorAllocator::~CLTensorAllocator()
+{
+ _buffer = cl::Buffer();
+}
+
uint8_t *CLTensorAllocator::data()
{
return _mapping;
@@ -47,17 +53,32 @@
void CLTensorAllocator::allocate()
{
ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
-
- _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size());
+ if(_associated_memory_group == nullptr)
+ {
+ _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size());
+ }
+ else
+ {
+ _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_buffer()), info().total_size());
+ }
info().set_is_resizable(false);
}
void CLTensorAllocator::free()
{
- ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ if(_associated_memory_group == nullptr)
+ {
+ _buffer = cl::Buffer();
+ info().set_is_resizable(true);
+ }
+}
- _buffer = cl::Buffer();
- info().set_is_resizable(true);
+void CLTensorAllocator::set_associated_memory_group(CLMemoryGroup *associated_memory_group)
+{
+ ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
+ ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
+ ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+ _associated_memory_group = associated_memory_group;
}
uint8_t *CLTensorAllocator::lock()
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
new file mode 100644
index 0000000..7f5be86
--- /dev/null
+++ b/src/runtime/CL/CLTuner.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLTuner.h"
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <chrono>
+#include <limits>
+#include <string>
+
+using namespace arm_compute;
+
+CLTuner::CLTuner()
+ : _lws_table()
+{
+}
+
+void CLTuner::tune_kernel(ICLKernel &kernel)
+{
+ // Get the configuration ID from the kernel
+ const std::string &config_id = kernel.config_id();
+
+ // Check if we need to find the optimal LWS. If config_id is equal to default_config_id, the kernel does not need to be tuned
+ if(config_id != arm_compute::default_config_id)
+ {
+ auto p = _lws_table.find(config_id);
+
+ if(p == _lws_table.end())
+ {
+ // Find the optimal LWS for the kernel
+ cl::NDRange opt_lws = find_optimal_lws(kernel);
+
+ // Insert the optimal LWS in the table
+ _lws_table.emplace(config_id, opt_lws);
+
+ // Set Local-Workgroup-Size
+ kernel.set_lws_hint(opt_lws);
+ }
+ else
+ {
+ // Set Local-Workgroup-Size
+ kernel.set_lws_hint(p->second);
+ }
+ }
+}
+
+cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ double min_exec_time = std::numeric_limits<double>::max();
+
+ cl::NDRange opt_lws = cl::NDRange(1, 1);
+
+ for(int y = 1; y <= 16; ++y)
+ {
+ for(int x = 1; x <= 16; ++x)
+ {
+ cl::NDRange lws_test = cl::NDRange(x, y);
+
+ // Set the Local-Workgroup-Size
+ kernel.set_lws_hint(lws_test);
+
+ auto t_start = std::chrono::high_resolution_clock::now();
+
+ // Run
+ kernel.run(kernel.window(), q);
+
+ CLScheduler::get().sync();
+
+ auto t_stop = std::chrono::high_resolution_clock::now();
+
+ std::chrono::duration<double, std::nano> fp_nano = t_stop - t_start;
+
+ // Check the execution time
+ if(fp_nano.count() < min_exec_time)
+ {
+ min_exec_time = fp_nano.count();
+ opt_lws = cl::NDRange(x, y);
+ }
+ }
+ }
+
+ return opt_lws;
+}
+
+void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table)
+{
+ _lws_table.clear();
+ _lws_table = lws_table;
+}
+
+const std::unordered_map<std::string, cl::NDRange> &CLTuner::export_lws_table()
+{
+ return _lws_table;
+}
\ No newline at end of file
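
Since find_optimal_lws times every candidate local-workgroup size by actually running the kernel, tuning is expensive; the import/export hooks exist so the table can be captured once and reused on later runs. A minimal sketch of that round trip is shown below; the on-disk serialization of the exported map is left out, and the workload run that would populate the table is only indicated by a comment.

#include "arm_compute/runtime/CL/CLTuner.h"

#include <string>
#include <unordered_map>

int main()
{
    arm_compute::CLTuner tuner;

    // ... run the workload once with this tuner attached so the LWS table gets populated ...

    // Capture the tuned local-workgroup sizes, keyed by kernel config_id
    std::unordered_map<std::string, cl::NDRange> table = tuner.export_lws_table();

    // ... persist 'table' in whatever format the application prefers ...

    // On a later run, seed a fresh tuner with the saved results to skip re-tuning
    arm_compute::CLTuner warm_tuner;
    warm_tuner.import_lws_table(table);
    return 0;
}
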
diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp
index aa45743..a1a56fd 100644
--- a/src/runtime/CL/ICLSimpleFunction.cpp
+++ b/src/runtime/CL/ICLSimpleFunction.cpp
@@ -28,8 +28,9 @@
using namespace arm_compute;
-ICLSimpleFunction::ICLSimpleFunction()
- : _kernel(), _border_handler()
+ICLSimpleFunction::ICLSimpleFunction() // NOLINT
+ : _kernel(),
+ _border_handler()
{
}
diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
index 5097dd4..5613e6c 100644
--- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp
+++ b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLAbsoluteDifferenceKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLAbsoluteDifferenceKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp
index 56c5199..78f25fc 100644
--- a/src/runtime/CL/functions/CLAccumulate.cpp
+++ b/src/runtime/CL/functions/CLAccumulate.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLAccumulate.h"
#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,21 +32,21 @@
void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum)
{
- auto k = arm_compute::cpp14::make_unique<CLAccumulateKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLAccumulateKernel>();
k->configure(input, accum);
_kernel = std::move(k);
}
void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
{
- auto k = arm_compute::cpp14::make_unique<CLAccumulateWeightedKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLAccumulateWeightedKernel>();
k->configure(input, alpha, accum);
_kernel = std::move(k);
}
void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
{
- auto k = arm_compute::cpp14::make_unique<CLAccumulateSquaredKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLAccumulateSquaredKernel>();
k->configure(input, shift, accum);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 9b5bd8b..b64739a 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -24,13 +24,13 @@
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-void CLActivationLayer::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
{
- auto k = arm_compute::cpp14::make_unique<CLActivationLayerKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>();
k->configure(input, output, act_info);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp
index 36bff42..5ca384d 100644
--- a/src/runtime/CL/functions/CLArithmeticAddition.cpp
+++ b/src/runtime/CL/functions/CLArithmeticAddition.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLArithmeticAddition::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
{
- auto k = arm_compute::cpp14::make_unique<CLArithmeticAdditionKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticAdditionKernel>();
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
index 97f0a1c..651f51a 100644
--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+++ b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
{
- auto k = arm_compute::cpp14::make_unique<CLArithmeticSubtractionKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionKernel>();
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index 3df673c..68cdaac 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -37,7 +37,7 @@
{
}
-void CLBatchNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon)
+void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon)
{
_norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
}
diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp
index 7c85043..f8a5a85 100644
--- a/src/runtime/CL/functions/CLBitwiseAnd.cpp
+++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLBitwiseAndKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBitwiseAndKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp
index 17ae5de..dc002e5 100644
--- a/src/runtime/CL/functions/CLBitwiseNot.cpp
+++ b/src/runtime/CL/functions/CLBitwiseNot.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLBitwiseNotKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBitwiseNotKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp
index c84a279..4a10bb2 100644
--- a/src/runtime/CL/functions/CLBitwiseOr.cpp
+++ b/src/runtime/CL/functions/CLBitwiseOr.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLBitwiseOrKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBitwiseOrKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp
index fd49c7d..d23622a 100644
--- a/src/runtime/CL/functions/CLBitwiseXor.cpp
+++ b/src/runtime/CL/functions/CLBitwiseXor.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLBitwiseXorKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBitwiseXorKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp
index 8de6807..f28be44 100644
--- a/src/runtime/CL/functions/CLBox3x3.cpp
+++ b/src/runtime/CL/functions/CLBox3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLBox3x3.h"
#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,8 +33,8 @@
void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLBox3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBox3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 1d018b8..5acb8e7 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -26,17 +26,31 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLCannyEdge::CLCannyEdge()
- : _sobel(nullptr), _gradient(), _border_mag_gradient(), _non_max_suppr(), _edge_trace(), _gx(), _gy(), _mag(), _phase(), _nonmax(), _visited(), _recorded(), _l1_list_counter(), _l1_stack()
+CLCannyEdge::CLCannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _sobel(),
+ _gradient(),
+ _border_mag_gradient(),
+ _non_max_suppr(),
+ _edge_trace(),
+ _gx(),
+ _gy(),
+ _mag(),
+ _phase(),
+ _nonmax(),
+ _visited(),
+ _recorded(),
+ _l1_list_counter(),
+ _l1_stack()
{
}
@@ -83,22 +97,26 @@
TensorInfo info_s32(shape_l1_stack, 1, arm_compute::DataType::S32);
_l1_stack.allocator()->init(info_s32);
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
+
// Configure/Init sobelNxN
if(gradient_size == 3)
{
- auto k = arm_compute::cpp14::make_unique<CLSobel3x3>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
else if(gradient_size == 5)
{
- auto k = arm_compute::cpp14::make_unique<CLSobel5x5>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
else if(gradient_size == 7)
{
- auto k = arm_compute::cpp14::make_unique<CLSobel7x7>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
@@ -107,23 +125,43 @@
ARM_COMPUTE_ERROR("Gradient size %d not supported", gradient_size);
}
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Configure gradient
_gradient.configure(&_gx, &_gy, &_mag, &_phase, norm_type);
+ // Allocate intermediate buffers
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_nonmax);
+
// Configure non-maxima suppression
_non_max_suppr.configure(&_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED);
+ // Allocate intermediate buffers
+ _phase.allocator()->allocate();
+
// Fill border around magnitude image as non-maxima suppression will access
// it. If border mode is undefined filling the border is a nop.
_border_mag_gradient.configure(&_mag, _non_max_suppr.border_size(), border_mode, constant_border_value);
+ // Allocate intermediate buffers
+ _mag.allocator()->allocate();
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_visited);
+ _memory_group.manage(&_recorded);
+ _memory_group.manage(&_l1_stack);
+ _memory_group.manage(&_l1_list_counter);
+
// Configure edge tracing
_edge_trace.configure(&_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter);
- _gx.allocator()->allocate();
- _gy.allocator()->allocate();
- _phase.allocator()->allocate();
- _mag.allocator()->allocate();
+ // Allocate intermediate buffers
_visited.allocator()->allocate();
_recorded.allocator()->allocate();
_l1_stack.allocator()->allocate();
@@ -133,6 +171,8 @@
void CLCannyEdge::run()
{
+ _memory_group.acquire();
+
// Run sobel
_sobel->run();
@@ -152,4 +192,6 @@
_l1_list_counter.clear(CLScheduler::get().queue());
_l1_stack.clear(CLScheduler::get().queue());
CLScheduler::get().enqueue(_edge_trace, true);
+
+ _memory_group.release();
}
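
The memory-group protocol introduced here recurs in the other function changes in this patch: manage() declares an intermediate tensor before its consumers are configured, allocate() marks the end of its lifetime once the last consumer has been configured, and acquire()/release() bracket run() so pooled memory is only held while the function executes. A minimal sketch of that protocol on a hypothetical function (kernel configuration elided) follows.

#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IMemoryManager.h"

#include <memory>

// Hypothetical function-style wrapper showing the lifetime protocol only;
// the actual kernel configuration calls are elided.
class ExampleFunction
{
public:
    ExampleFunction(std::shared_ptr<arm_compute::IMemoryManager> memory_manager)
        : _memory_group(std::move(memory_manager)), _tmp()
    {
    }

    void configure(/* inputs/outputs */)
    {
        // Declare the intermediate tensor to the group before any consumer is configured
        _memory_group.manage(&_tmp);

        // ... configure the kernels that write to / read from _tmp ...

        // Mark the end of _tmp's lifetime once its last consumer has been configured
        _tmp.allocator()->allocate();
    }

    void run()
    {
        // Acquire backing memory for all managed tensors, run, then return it to the pool
        _memory_group.acquire();
        // ... enqueue kernels ...
        _memory_group.release();
    }

private:
    arm_compute::CLMemoryGroup _memory_group;
    arm_compute::CLTensor      _tmp;
};
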
diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp
index 79a3676..11605cf 100644
--- a/src/runtime/CL/functions/CLChannelCombine.cpp
+++ b/src/runtime/CL/functions/CLChannelCombine.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,14 +32,14 @@
void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLChannelCombineKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>();
k->configure(plane0, plane1, plane2, plane3, output);
_kernel = std::move(k);
}
void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLChannelCombineKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>();
k->configure(plane0, plane1, plane2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp
index 2c6174b..5090382 100644
--- a/src/runtime/CL/functions/CLChannelExtract.cpp
+++ b/src/runtime/CL/functions/CLChannelExtract.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,14 +32,14 @@
void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLChannelExtractKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>();
k->configure(input, channel, output);
_kernel = std::move(k);
}
void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLChannelExtractKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>();
k->configure(input, channel, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp
index 2fe465a..65f8ac3 100644
--- a/src/runtime/CL/functions/CLColorConvert.cpp
+++ b/src/runtime/CL/functions/CLColorConvert.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLColorConvert.h"
#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,28 +32,28 @@
void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index 21b5d47..a9b0867 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -26,13 +26,13 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/ITensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -40,15 +40,15 @@
void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLConvolution3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLConvolution3x3Kernel>();
k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
template <unsigned int matrix_size>
-CLConvolutionSquare<matrix_size>::CLConvolutionSquare()
- : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+CLConvolutionSquare<matrix_size>::CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
{
}
@@ -66,6 +66,9 @@
std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size);
_tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+
if(scale == 0)
{
scale = calculate_matrix_scale(conv, matrix_size);
@@ -92,8 +95,12 @@
if(_is_separable)
{
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_kernel_hor, false);
CLScheduler::get().enqueue(_kernel_vert);
+
+ _memory_group.release();
}
else
{
@@ -107,7 +114,7 @@
void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLConvolutionRectangleKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLConvolutionRectangleKernel>();
k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index f0bbc35..4b1bfd8 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -24,32 +24,31 @@
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include <cmath>
+#include <memory>
#include <tuple>
using namespace arm_compute;
-CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
- : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
{
}
void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
@@ -65,10 +64,12 @@
const unsigned int mat_weights_cols = weights->info()->dimension(3);
const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- const DataType dt = weights->info()->data_type();
- TensorInfo info_wr(shape_wr, 1, dt);
+ const DataType dt = weights->info()->data_type();
+ const int fixed_point_position = weights->info()->fixed_point_position();
+ TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
_weights_reshaped.allocator()->init(info_wr);
+ _memory_group.manage(&_weights_reshaped);
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_weights_transposed_kernel.configure(&_weights_reshaped, output);
_weights_reshaped.allocator()->allocate();
@@ -81,41 +82,50 @@
void CLConvolutionLayerReshapeWeights::run()
{
+ _memory_group.acquire();
+
cl::CommandQueue q = CLScheduler::get().queue();
CLScheduler::get().enqueue(_weights_reshape_kernel);
if(_transpose1xW)
{
CLScheduler::get().enqueue(_weights_transposed_kernel);
}
+
+ _memory_group.release();
}
-CLConvolutionLayer::CLConvolutionLayer()
- : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
- _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
+ _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
{
}
void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
}
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // Set the GPU target for matrix multiply
+ _mm_kernel.set_target(CLScheduler::get().target());
+
_has_bias = (biases != nullptr);
_are_weights_reshaped = weights_info.are_reshaped();
- // Get parameters for conv_info
+ // Get parameters from conv_info
unsigned int stride_x = 0;
unsigned int stride_y = 0;
unsigned int pad_x = 0;
@@ -127,20 +137,21 @@
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- const unsigned int kernel_width = _are_weights_reshaped ? weights_info.kernel_size() : weights->info()->dimension(0);
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
- stride_x, stride_y, pad_x, pad_y, conv_info.round());
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+ const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
+ const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+ conv_info);
// Check if its a "fully connected" convolution
_is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
- // Create tensor to store the reshaped weights
- size_t mat_weights_cols = weights->info()->dimension(3);
- size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+ unsigned int mat_weights_cols = weights->info()->dimension(3);
+ unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+
+ // Reshape weights if needed
if(_are_weights_reshaped)
{
- mat_weights_cols = output->info()->dimension(2);
+ mat_weights_cols = weights_info.num_kernels();
const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
}
@@ -150,77 +161,75 @@
{
// Create tensor to store the reshaped weights
TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- TensorInfo info_wr(shape_wr, 1, weights->info()->data_type());
+ TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
_weights_reshaped.allocator()->init(info_wr);
- _reshape_weights.configure(weights, biases, &_weights_reshaped, false);
- weights = &_weights_reshaped;
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
}
else
{
// Create tensor to store transposed weights
- TensorShape shape_wt(mat_weights_rows * 4, static_cast<size_t>(std::ceil(mat_weights_cols / 4.f)));
- TensorInfo info_wt(shape_wt, 1, weights->info()->data_type());
- _weights_transposed.allocator()->init(info_wt);
- _reshape_weights.configure(weights, biases, &_weights_transposed, true);
- weights = &_weights_transposed;
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _weights_reshaped.allocator()->init(info_wt);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
}
+ weights = &_weights_reshaped;
}
+
// Create tensor to store im2col reshaped inputs
- const size_t mat_input_cols = mat_weights_rows;
- const size_t mat_input_rows = conv_w * conv_h;
- TensorShape shape_im2col = input->info()->tensor_shape();
+ const unsigned int mat_input_cols = mat_weights_rows;
+ const unsigned int mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->info()->tensor_shape();
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
- _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+ _memory_group.manage(&_input_im2col_reshaped);
// Create tensor (interleave) to prepare input tensor for GEMM
if(!_is_fully_connected_convolution)
{
TensorShape shape_interleaved = shape_im2col;
shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4.f));
- _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
+ shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+ _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+ _memory_group.manage(&_input_interleaved_reshaped);
}
// Create GEMM output tensor
TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, mat_input_rows);
- _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
+ _memory_group.manage(&_gemm_output);
// Configure kernels
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
- _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
+ // Configure matrix multiply
if(_is_fully_connected_convolution)
{
- _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ // The matrices A and B have not been reshaped
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f, false);
}
else
{
_input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
_mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
- }
-
- if(!_are_weights_reshaped)
- {
- if(!_is_fully_connected_convolution)
- {
- _weights_transposed.allocator()->allocate();
- }
- else
- {
- _weights_reshaped.allocator()->allocate();
- }
- }
-
- _input_im2col_reshaped.allocator()->allocate();
- if(!_is_fully_connected_convolution)
- {
_input_interleaved_reshaped.allocator()->allocate();
}
+ _input_im2col_reshaped.allocator()->allocate();
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
_gemm_output.allocator()->allocate();
+
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+
+ // Allocate intermediate tensor
+ if(!_are_weights_reshaped)
+ {
+ _weights_reshaped.allocator()->allocate();
+ }
}
void CLConvolutionLayer::run()
@@ -232,6 +241,8 @@
_reshape_weights.run();
}
+ _memory_group.acquire();
+
// Run input reshaping
CLScheduler::get().enqueue(_input_im2col_kernel);
if(!_is_fully_connected_convolution)
@@ -244,4 +255,6 @@
// Reshape output matrix
CLScheduler::get().enqueue(_output_col2im_kernel, false);
+
+ _memory_group.release();
}
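
The shape bookkeeping in CLConvolutionLayer::configure is easier to follow in isolation: the reshaped weights have one column per output feature map and one row per receptive-field element (plus one when a bias is appended), the im2col output has one row per output pixel, and the GEMM result is later folded back by col2im. A small standalone sketch of that arithmetic with hypothetical layer dimensions:

#include <cstddef>
#include <iostream>

int main()
{
    // Hypothetical layer: 3x3 kernel, 64 input channels, 128 output feature maps,
    // producing a 56x56 output (conv_w x conv_h), with a bias term.
    const std::size_t kernel_w = 3, kernel_h = 3, ifm = 64, ofm = 128;
    const std::size_t conv_w = 56, conv_h = 56;
    const bool        has_bias = true;

    // Weights reshaped for GEMM: one column per output feature map,
    // one row per receptive-field element (+1 row when a bias is appended)
    const std::size_t mat_weights_cols = ofm;
    const std::size_t mat_weights_rows = kernel_w * kernel_h * ifm + (has_bias ? 1 : 0);

    // im2col output: one row per output pixel, one column per receptive-field element
    const std::size_t mat_input_cols = mat_weights_rows;
    const std::size_t mat_input_rows = conv_w * conv_h;

    // GEMM result: (mat_input_rows x mat_weights_cols), reshaped back to WxHxOFM by col2im
    std::cout << "weights: " << mat_weights_rows << "x" << mat_weights_cols << "\n"
              << "im2col : " << mat_input_rows << "x" << mat_input_cols << "\n"
              << "gemm   : " << mat_input_rows << "x" << mat_weights_cols << "\n";
    return 0;
}
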
diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp
index d967d98..f42627f 100644
--- a/src/runtime/CL/functions/CLDepthConcatenate.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenate.cpp
@@ -24,22 +24,23 @@
#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLDepthConcatenate::CLDepthConcatenate()
- : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+CLDepthConcatenate::CLDepthConcatenate() // NOLINT
+ : _inputs_vector(),
+ _concat_kernels_vector(),
+ _border_handlers_vector(),
+ _num_inputs(0)
{
}
-void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output)
+void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
{
ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
@@ -47,8 +48,8 @@
unsigned int depth_offset = 0;
- _concat_kernels_vector = arm_compute::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
- _border_handlers_vector = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+ _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
+ _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
for(unsigned int i = 0; i < _num_inputs; i++)
{
diff --git a/src/runtime/CL/functions/CLDepthConvert.cpp b/src/runtime/CL/functions/CLDepthConvert.cpp
index edcd492..b64d05b 100644
--- a/src/runtime/CL/functions/CLDepthConvert.cpp
+++ b/src/runtime/CL/functions/CLDepthConvert.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLDepthConvert::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
{
- auto k = arm_compute::cpp14::make_unique<CLDepthConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertKernel>();
k->configure(input, output, policy, shift);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolution.cpp b/src/runtime/CL/functions/CLDepthwiseConvolution.cpp
new file mode 100644
index 0000000..22c037f
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthwiseConvolution.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDepthwiseConvolution3x3::CLDepthwiseConvolution3x3()
+ : _kernel(), _border_handler()
+{
+}
+
+void CLDepthwiseConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+ _kernel.configure(input, output, weights, conv_info);
+ _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+void CLDepthwiseConvolution3x3::run()
+{
+ CLScheduler::get().enqueue(_border_handler);
+ CLScheduler::get().enqueue(_kernel);
+}
+
+CLDepthwiseConvolution::CLDepthwiseConvolution()
+ : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), _weights_reshaped(),
+ _v2mm_output()
+{
+}
+
+void CLDepthwiseConvolution::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
+
+ const size_t weights_w = weights->info()->dimension(0);
+ const size_t weights_h = weights->info()->dimension(1);
+ const size_t weights_z = weights->info()->dimension(2);
+
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
+
+ // Set up intermediate tensors
+ const size_t patch_size = weights_w * weights_h;
+ const size_t conv_size = conv_w * conv_h;
+
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, patch_size);
+ shape_im2col.set(1, conv_size);
+ shape_im2col.set(2, weights_z);
+
+ const TensorShape shape_weights_reshape(patch_size, weights_z);
+ TensorShape shape_v2mm_out = output->info()->tensor_shape();
+ shape_v2mm_out.set(0, conv_size * weights_z);
+ shape_v2mm_out.set(1, 1);
+ shape_v2mm_out.set(2, 1);
+
+ const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ const TensorInfo info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+ const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ _input_reshaped.allocator()->init(info_im2col);
+ _weights_reshaped.allocator()->init(info_weights_reshape);
+ _v2mm_output.allocator()->init(info_v2mm_out);
+
+ // Configure kernels
+ _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info);
+ _weights_reshape_kernel.configure(weights, &_weights_reshaped);
+ _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+ _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
+
+ BorderSize border_size = _v2mm_kernel.border_size();
+ _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
+
+ border_size.bottom = 0;
+ _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
+
+ // Allocate intermediate tensors
+ _input_reshaped.allocator()->allocate();
+ _weights_reshaped.allocator()->allocate();
+ _v2mm_output.allocator()->allocate();
+}
+
+void CLDepthwiseConvolution::run()
+{
+ CLScheduler::get().enqueue(_im2col_kernel);
+
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+
+ CLScheduler::get().enqueue(_v2mm_input_fill_border);
+ CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+ CLScheduler::get().enqueue(_v2mm_kernel);
+
+ CLScheduler::get().enqueue(_vector_to_tensor_kernel);
+}
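
A minimal configure/run sketch for the new 3x3 depthwise path is shown below, assuming the CL runtime can be initialised via default_init() and using hypothetical F32 shapes (width x height x channels ordering, stride 1, padding 1 so the spatial size is preserved); input and weight data would be filled in via map/unmap before run().

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Hypothetical shapes: 32x32 input with 16 channels, one 3x3 filter per channel
    CLTensor input, weights, output;
    input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));

    // Configure before allocating, as elsewhere in the library
    CLDepthwiseConvolution3x3 depthwise;
    depthwise.configure(&input, &output, &weights, PadStrideInfo(1, 1, 1, 1));

    input.allocator()->allocate();
    weights.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input and weights (e.g. via map()/unmap()) ...

    depthwise.run();
    CLScheduler::get().sync();
    return 0;
}
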
diff --git a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
new file mode 100644
index 0000000..c325b3e
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDepthwiseSeparableConvolutionLayer::CLDepthwiseSeparableConvolutionLayer()
+ : _depthwise_conv(), _pointwise_conv()
+{
+}
+
+void CLDepthwiseSeparableConvolutionLayer::configure(ICLTensor *input, const ICLTensor *depthwise_weights, ICLTensor *depthwise_out, const ICLTensor *pointwise_weights, const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
+{
+ _depthwise_conv.configure(input, depthwise_out, depthwise_weights, depthwise_conv_info);
+ _pointwise_conv.configure(depthwise_out, pointwise_weights, biases, output, pointwise_conv_info);
+}
+
+void CLDepthwiseSeparableConvolutionLayer::run()
+{
+ _depthwise_conv.run();
+ _pointwise_conv.run();
+}
\ No newline at end of file
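
A usage sketch for the composite function, following the configure() signature above; the tensor shapes (a 3x3 depthwise stage followed by a 1x1 pointwise stage) are my own illustrative choices.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Depthwise 3x3 stage: 16x16x8 -> 14x14x8, then pointwise 1x1 stage: 14x14x8 -> 14x14x16
    CLTensor src, dw_weights, dw_out, pw_weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    dw_weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U), 1, DataType::F32));
    dw_out.allocator()->init(TensorInfo(TensorShape(14U, 14U, 8U), 1, DataType::F32));
    pw_weights.allocator()->init(TensorInfo(TensorShape(1U, 1U, 8U, 16U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(14U, 14U, 16U), 1, DataType::F32));

    CLDepthwiseSeparableConvolutionLayer dsconv;
    dsconv.configure(&src, &dw_weights, &dw_out, &pw_weights, &biases, &dst,
                     PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0));

    for(auto *t : { &src, &dw_weights, &dw_out, &pw_weights, &biases, &dst })
    {
        t->allocator()->allocate();
    }
    // ... fill src, dw_weights, pw_weights and biases ...

    dsconv.run();
    CLScheduler::get().sync();
    return 0;
}
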
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
new file mode 100644
index 0000000..5559d42
--- /dev/null
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDequantizationLayer::CLDequantizationLayer()
+ : _dequantize_kernel()
+{
+}
+
+void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
+{
+ _dequantize_kernel.configure(input, output, min_max);
+}
+
+void CLDequantizationLayer::run()
+{
+ // Run dequantization kernel
+ CLScheduler::get().enqueue(_dequantize_kernel, false);
+}
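
The function is a thin wrapper around its kernel, so the interesting part is the tensor contract: a quantized U8 input, an F32 output of the same shape, and a small F32 tensor holding the min/max pair used for the linear mapping. The (2, 1) min_max shape below is an assumption; in practice the tensor normally comes from the companion min/max computation.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor q_in, f_out, min_max;
    q_in.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::U8));
    f_out.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::F32));
    min_max.allocator()->init(TensorInfo(TensorShape(2U, 1U), 1, DataType::F32)); // {min, max} per batch (assumed layout)

    CLDequantizationLayer dequant;
    dequant.configure(&q_in, &f_out, &min_max);

    q_in.allocator()->allocate();
    f_out.allocator()->allocate();
    min_max.allocator()->allocate();
    // ... fill q_in and min_max ...

    dequant.run();
    CLScheduler::get().sync();
    return 0;
}
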
diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp
index c51cb4c..ae49996 100644
--- a/src/runtime/CL/functions/CLDerivative.cpp
+++ b/src/runtime/CL/functions/CLDerivative.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLDerivative.h"
#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,8 +33,8 @@
void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLDerivativeKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLDerivativeKernel>();
k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
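
The public configure() signature in this hunk is unchanged, so callers are unaffected; for reference, a minimal invocation with a U8 source and S16 gradients looks like this.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDerivative.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, grad_x, grad_y;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::U8));
    grad_x.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::S16));
    grad_y.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::S16));

    CLDerivative derivative;
    derivative.configure(&src, &grad_x, &grad_y, BorderMode::UNDEFINED, 0);

    src.allocator()->allocate();
    grad_x.allocator()->allocate();
    grad_y.allocator()->allocate();
    // ... fill src ...

    derivative.run();
    CLScheduler::get().sync();
    return 0;
}
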
diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp
index 345f477..59c5ea5 100644
--- a/src/runtime/CL/functions/CLDilate.cpp
+++ b/src/runtime/CL/functions/CLDilate.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLDilate.h"
#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,8 +33,8 @@
void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLDilateKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLDilateKernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
new file mode 100644
index 0000000..6fafd9c
--- /dev/null
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDirectConvolutionLayer::CLDirectConvolutionLayer()
+ : _direct_conv_kernel(), _input_border_handler()
+{
+}
+
+void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+ // Set GPU target
+ _direct_conv_kernel.set_target(CLScheduler::get().target());
+
+ // Configure direct convolution
+ _direct_conv_kernel.configure(input, weights, biases, output, conv_info);
+
+ // Configure border handler
+ _input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+void CLDirectConvolutionLayer::run()
+{
+ // Run border handler
+ CLScheduler::get().enqueue(_input_border_handler, false);
+
+ // Run direct convolution
+ CLScheduler::get().enqueue(_direct_conv_kernel);
+}
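
A usage sketch for the new direct convolution, assuming the usual width x height x input-channels x output-channels weights layout and a 3x3 kernel with no padding; the shapes are illustrative.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // 8x8x3 input, 16 filters of 3x3x3, stride 1, no padding -> 6x6x16 output
    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(6U, 6U, 16U), 1, DataType::F32));

    CLDirectConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 0, 0));

    for(auto *t : { &src, &weights, &biases, &dst })
    {
        t->allocator()->allocate();
    }
    // ... fill src, weights and biases ...

    conv.run();
    CLScheduler::get().sync();
    return 0;
}
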
diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp
index b4c50e4..eb1f6e4 100644
--- a/src/runtime/CL/functions/CLErode.cpp
+++ b/src/runtime/CL/functions/CLErode.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLErode.h"
#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,8 +33,8 @@
void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLErodeKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLErodeKernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
index d2903fb..7a0dd09 100644
--- a/src/runtime/CL/functions/CLFastCorners.cpp
+++ b/src/runtime/CL/functions/CLFastCorners.cpp
@@ -36,8 +36,9 @@
using namespace arm_compute;
-CLFastCorners::CLFastCorners()
- : _fast_corners_kernel(),
+CLFastCorners::CLFastCorners(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)),
+ _fast_corners_kernel(),
_suppr_func(),
_copy_array_kernel(),
_output(),
@@ -70,6 +71,7 @@
const bool update_number = (nullptr != _num_corners);
+ _memory_group.manage(&_output);
_fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, border_mode);
if(!_non_max)
@@ -79,6 +81,7 @@
else
{
_suppr.allocator()->init(tensor_info);
+ _memory_group.manage(&_suppr);
_suppr_func.configure(&_output, &_suppr, border_mode);
_copy_array_kernel.configure(&_suppr, update_number, corners, &_num_buffer);
@@ -94,6 +97,8 @@
{
cl::CommandQueue q = CLScheduler::get().queue();
+ _memory_group.acquire();
+
if(_non_max)
{
ARM_COMPUTE_ERROR_ON_MSG(_output.cl_buffer().get() == nullptr, "Unconfigured function");
@@ -124,4 +129,6 @@
}
q.flush();
+
+ _memory_group.release();
}
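
The acquire()/release() pair added above only pays off when several functions share one IMemoryManager, so that their intermediate buffers can draw on the same pool. Below is a sketch of how such a manager might be wired up; the BlobLifetimeManager/PoolManager/MemoryManagerOnDemand combination and the set_allocator/set_num_pools/finalize sequence reflect the 17.09-era interface as I understand it, so treat them as assumptions.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/CL/CLBufferAllocator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"

#include <memory>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // One lifetime manager and one pool manager backing a single on-demand memory manager
    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
    auto pool_mgr     = std::make_shared<PoolManager>();
    auto mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    // Functions built with the same manager can reuse each other's intermediate buffers,
    // because their memory groups only hold the memory between acquire() and release()
    CLGaussian5x5 gauss_a(mm);
    CLGaussian5x5 gauss_b(mm);

    CLTensor src, mid, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::U8));
    mid.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::U8));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::U8));

    gauss_a.configure(&src, &mid, BorderMode::REPLICATE);
    gauss_b.configure(&mid, &dst, BorderMode::REPLICATE);

    src.allocator()->allocate();
    mid.allocator()->allocate();
    dst.allocator()->allocate();

    // Finalize the manager once every function has been configured
    // (set_allocator/set_num_pools/finalize is assumed here to be the 17.09 interface)
    CLBufferAllocator allocator;
    mm->set_allocator(&allocator);
    mm->set_num_pools(1);
    mm->finalize();

    gauss_a.run();
    gauss_b.run();
    CLScheduler::get().sync();
    return 0;
}
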
diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp
index 9e59b77..54c096e 100644
--- a/src/runtime/CL/functions/CLFillBorder.cpp
+++ b/src/runtime/CL/functions/CLFillBorder.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLFillBorder.h"
#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLFillBorderKernel>();
- k->configure(tensor, border_width, border_mode, constant_border_value);
+ auto k = arm_compute::support::cpp14::make_unique<CLFillBorderKernel>();
+ k->configure(tensor, BorderSize(border_width), border_mode, constant_border_value);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
new file mode 100644
index 0000000..9f571b2
--- /dev/null
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "arm_compute/core/Size2D.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLIm2ColKernel>();
+ k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+ _kernel = std::move(k);
+}
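
Since the layer is just an Im2Col pass with a 1x1 patch, the only contract worth spelling out is the shape one: for a single batch the output width must be the product of the input's spatial and channel dimensions. A sketch under that assumption:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(48U), 1, DataType::F32)); // 4 * 4 * 3

    CLFlattenLayer flatten;
    flatten.configure(&src, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src ...

    flatten.run();
    CLScheduler::get().sync();
    return 0;
}
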
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
new file mode 100644
index 0000000..364db34
--- /dev/null
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFloor.h"
+
+#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLFloor::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLFloorKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
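
Nothing special to drive here; an element-wise floor over an F32 tensor with an output of matching shape:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFloor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));

    CLFloor floor_fn;
    floor_fn.configure(&src, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src ...

    floor_fn.run();
    CLScheduler::get().sync();
    return 0;
}
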
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 57d57d5..ee1558f 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -23,88 +23,31 @@
*/
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
#include <algorithm>
-#include <cmath>
using namespace arm_compute;
-CLFullyConnectedLayerReshapeWeights::CLFullyConnectedLayerReshapeWeights()
- : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(),
+ _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
{
}
-void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer)
+void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
- ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
-
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- _transpose_weights = transpose_weights;
- _is_batched_fc_layer = is_batched_fc_layer;
-
- // Check if we need to transpose the weights
- if(_transpose_weights)
- {
- if(_is_batched_fc_layer)
- {
- // Initialize the output tensor for transpose
- TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
- _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
- _transpose_kernel.configure(input, &_transpose_output);
-
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(&_transpose_output, output);
-
- // Allocate temporary tensor used for transposing the weights
- _transpose_output.allocator()->allocate();
- }
- else
- {
- _transpose_kernel.configure(input, output);
- }
- }
- else
- {
- if(_is_batched_fc_layer)
- {
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(input, output);
- }
- else
- {
- ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
- }
- }
-}
-
-void CLFullyConnectedLayerReshapeWeights::run()
-{
- if(_transpose_weights)
- {
- CLScheduler::get().enqueue(_transpose_kernel, _is_batched_fc_layer);
- }
- if(_is_batched_fc_layer)
- {
- CLScheduler::get().enqueue(_transpose1xW_kernel);
- }
-}
-
-CLFullyConnectedLayer::CLFullyConnectedLayer()
- : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
- _are_weights_reshaped(true), _is_fc_after_conv(true), _is_batched_fc_layer(false), _accumulate_biases(false)
-{
-}
-
-void CLFullyConnectedLayer::configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
+ ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
const DataType dt = input->info()->data_type();
const int fixed_point_position = input->info()->fixed_point_position();
@@ -119,93 +62,33 @@
shape_im2col.set(3, input->info()->dimension(5));
_im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
- // Initialize output tensor for interleave 4x4
- TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
-
// Configure im2col kernel
- _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
- // Configure interleave4x4 kernel
- _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
+ _memory_group.manage(&_im2col_output);
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
// Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
-
- // Allocate the tensors once all the configure methods have been called
- _im2col_output.allocator()->allocate();
- _interleave4x4_output.allocator()->allocate();
-}
-
-void CLFullyConnectedLayer::configure_fc_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
-{
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- // Initialize output tensor for interleave 4x4
- TensorShape shape_interleaved = input->info()->tensor_shape();
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
-
- // Configure interleave4x4 kernel
- _interleave4x4_kernel.configure(input, &_interleave4x4_output);
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
-
- // Allocate the tensors once all the configure methods have been called
- _interleave4x4_output.allocator()->allocate();
-}
-
-void CLFullyConnectedLayer::configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
- // Initialize output tensor for im2col
- TensorShape shape_im2col;
- shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
- shape_im2col.set(1, 1);
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
-
- // Configure im2col kernel
- _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(&_im2col_output, weights, output, 1.0f);
+ _mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false);
// Allocate the output tensor for im2col once all the configure methods have been called
_im2col_output.allocator()->allocate();
}
-void CLFullyConnectedLayer::configure_fc_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
// Configure matrix multiply kernel
- _mm_kernel.configure(input, weights, output, 1.0f);
+ _mm_kernel.configure(input, weights, output, 1.0f, false);
}
void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- _are_weights_reshaped = are_weights_reshaped;
+ _are_weights_reshaped = transpose_weights ? are_weights_reshaped : true;
_is_fc_after_conv = true;
- _is_batched_fc_layer = false;
_accumulate_biases = false;
if(biases != nullptr)
@@ -224,90 +107,46 @@
// 3) Convolution layer -> Fully Connected layer with batches
// 4) Fully Connected layer -> Fully Connected layer with batches
- // Check if we have a fully connected layer with batches
- _is_batched_fc_layer = (output->info()->dimension(1) > 1);
-
const ICLTensor *weights_to_use = weights;
- if(!are_weights_reshaped)
+ if(!_are_weights_reshaped)
{
- if((transpose_weights || _is_batched_fc_layer))
- {
- weights_to_use = &_reshape_weights_output;
+ weights_to_use = &_reshape_weights_output;
- if(transpose_weights)
- {
- if(_is_batched_fc_layer)
- {
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
- else
- {
- TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
- }
- else
- {
- ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
-
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
-
- // Reshape the weights
- _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
- }
+ // Reshape the weights
+ _reshape_weights_kernel.configure(weights, &_reshape_weights_output);
}
- if(_is_batched_fc_layer)
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+
+ if(is_batched_fc_layer)
{
_is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
input->info()->tensor_shape().cend(),
output->info()->tensor_shape().cbegin() + 1));
-
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer with batches
- configure_conv_fc_wb(input, weights_to_use, output);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer with batches
- configure_fc_fc_wb(input, weights_to_use, output);
- }
}
else
{
- // In case of not batched fully connected layer, the weights will not be reshaped using transposed1xW
- _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+ _is_fc_after_conv = input->info()->num_dimensions() > 1;
+ }
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- configure_conv_fc_nb(input, weights_to_use, output);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- configure_fc_fc_nb(input, weights_to_use, output);
- }
+ if(_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc(input, weights_to_use, output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc(input, weights_to_use, output);
}
// Allocate the reshaped weights tensor if the weights were not already reshaped, once all the configure methods have been called
- if(!are_weights_reshaped)
+ if(!_are_weights_reshaped)
{
- if(transpose_weights || _is_batched_fc_layer)
- {
- // Allocate the tensor for the weights reshaped
- _reshape_weights_output.allocator()->allocate();
- }
+ // Allocate the tensor for the weights reshaped
+ _reshape_weights_output.allocator()->allocate();
}
}
@@ -320,18 +159,14 @@
_reshape_weights_kernel.run();
}
+ _memory_group.acquire();
+
// Linearize input if it comes from a convolutional layer
if(_is_fc_after_conv)
{
CLScheduler::get().enqueue(_im2col_kernel, false);
}
- // Interleave input
- if(_is_batched_fc_layer)
- {
- CLScheduler::get().enqueue(_interleave4x4_kernel, false);
- }
-
// Run matrix multiply
CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
@@ -340,4 +175,6 @@
{
CLScheduler::get().enqueue(_accumulate_biases_kernel);
}
+
+ _memory_group.release();
}
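
With the batched/non-batched split gone, a plain fully connected call reduces to the sketch below: judging from the checks in this diff, the weights are handed over untransposed (width = number of inputs, height = number of outputs) and reshaped on the first run. Shapes and flag values are illustrative.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // 128 inputs, 32 outputs
    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));

    CLFullyConnectedLayer fc;
    fc.configure(&src, &weights, &biases, &dst, true /* transpose_weights */, false /* are_weights_reshaped */);

    for(auto *t : { &src, &weights, &biases, &dst })
    {
        t->allocator()->allocate();
    }
    // ... fill src, weights and biases ...

    fc.run();
    CLScheduler::get().sync();
    return 0;
}
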
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 7408054..a81d113 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -38,20 +38,18 @@
using namespace arm_compute;
-CLGEMM::CLGEMM()
- : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
+CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
{
}
void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
if(c != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
@@ -59,13 +57,18 @@
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the output matrix");
}
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
- if(a->info()->dimension(1) != 1)
+ // If matrix A has 16 rows or fewer, we run a special version of GEMM without reshaping the input tensors
+ _is_interleaved_transposed = a->info()->dimension(1) > 16;
+
+ const ICLTensor *matrix_a = a;
+ const ICLTensor *matrix_b = b;
+
+ if(_is_interleaved_transposed)
{
- _run_vector_matrix_multiplication = false;
+ matrix_a = &_tmp_a;
+ matrix_b = &_tmp_b;
TensorShape shape_tmp_a = a->info()->tensor_shape();
TensorShape shape_tmp_b = b->info()->tensor_shape();
@@ -73,27 +76,20 @@
shape_tmp_a.set(0, a->info()->dimension(0) * 4);
shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
- if(DataType::F32 == a->info()->data_type())
- {
- shape_tmp_b.set(0, b->info()->dimension(1) * 4);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.0f));
- }
- else if(DataType::F16 == a->info()->data_type())
- {
- shape_tmp_b.set(0, b->info()->dimension(1) * 8);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 8.0f));
- }
- else
- {
- ARM_COMPUTE_ERROR("DataType not supported");
- }
+ const unsigned int transpose_w = max_cl_vector_width / data_size_from_type(b->info()->data_type());
+ shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
_tmp_a.allocator()->init(info_a);
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position());
_tmp_b.allocator()->init(info_b);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
// Configure interleave kernel
_interleave_kernel.configure(a, &_tmp_a);
@@ -101,19 +97,17 @@
_transpose_kernel.configure(b, &_tmp_b);
// Configure matrix multiply kernel
- _mm_kernel.configure(&_tmp_a, &_tmp_b, output, alpha);
+ _mm_kernel.set_target(CLScheduler::get().target());
+ }
+ _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
+
+ if(_is_interleaved_transposed)
+ {
// Allocate intermediate tensors
_tmp_a.allocator()->allocate();
_tmp_b.allocator()->allocate();
}
- else // The first input tensor is a vector
- {
- _run_vector_matrix_multiplication = true;
-
- // Configure the matrix multiply kernel
- _mm_kernel.configure(a, b, output, alpha);
- }
// Configure matrix addition kernel
if(beta != 0 && c != nullptr)
@@ -125,7 +119,9 @@
void CLGEMM::run()
{
- if(!_run_vector_matrix_multiplication)
+ _memory_group.acquire();
+
+ if(_is_interleaved_transposed)
{
// Run interleave kernel
CLScheduler::get().enqueue(_interleave_kernel, false);
@@ -142,4 +138,6 @@
{
CLScheduler::get().enqueue(_ma_kernel);
}
+
+ _memory_group.release();
}
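
The reshape decision now depends only on the height of A, so small matrices go straight to the matrix-multiply kernel. A basic F32 call is sketched below; a shared IMemoryManager can optionally be passed to the constructor so that _tmp_a/_tmp_b take part in the pooling sketched after the CLFastCorners hunk above.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // dst = alpha * A * B : A is 4x32 (M=4 rows, K=32), B is 32x16 (K=32 rows, N=16) -> dst is 4x16
    CLTensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(32U, 4U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

    CLGEMM gemm;
    gemm.configure(&a, &b, nullptr, &dst, 1.0f, 0.0f); // no C matrix, so beta is ignored

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill a and b ...

    gemm.run(); // M = 4 <= 16, so the interleave/transpose kernels are skipped
    CLScheduler::get().sync();
    return 0;
}
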
diff --git a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
index 9dc7715..45547e4 100644
--- a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
+++ b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
@@ -24,13 +24,13 @@
#include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h"
#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
void CLGEMMInterleave4x4::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLGEMMInterleave4x4Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMInterleave4x4Kernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp
index 45e011d..db6d11c 100644
--- a/src/runtime/CL/functions/CLGEMMLowp.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowp.cpp
@@ -33,8 +33,8 @@
using namespace arm_compute;
-CLGEMMLowp::CLGEMMLowp()
- : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+CLGEMMLowp::CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
{
}
@@ -62,6 +62,10 @@
TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
_tmp_b.allocator()->init(info_b);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
// Configure kernels
_interleave_kernel.configure(a, &_tmp_a);
_transpose_kernel.configure(b, &_tmp_b);
@@ -74,6 +78,8 @@
void CLGEMMLowp::run()
{
+ _memory_group.acquire();
+
/* Run interleave kernel */
CLScheduler::get().enqueue(_interleave_kernel, false);
@@ -82,4 +88,6 @@
/* Run matrix multiply kernel */
CLScheduler::get().enqueue(_mm_kernel, false);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp b/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp
new file mode 100644
index 0000000..d054e01
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLGEMMTranspose1xW::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMTranspose1xWKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp
index 362a3fe..7ebabd7 100644
--- a/src/runtime/CL/functions/CLGaussian3x3.cpp
+++ b/src/runtime/CL/functions/CLGaussian3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLGaussian3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLGaussian3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
index e83a8fb..f30eee1 100644
--- a/src/runtime/CL/functions/CLGaussian5x5.cpp
+++ b/src/runtime/CL/functions/CLGaussian5x5.cpp
@@ -35,8 +35,8 @@
using namespace arm_compute;
-CLGaussian5x5::CLGaussian5x5()
- : _kernel_hor(), _kernel_vert(), _border_handler(), _tmp()
+CLGaussian5x5::CLGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _border_handler(), _tmp()
{
}
@@ -46,6 +46,10 @@
_tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, DataType::U16));
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+
+ // Configure kernels
_kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
_kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
_border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
@@ -57,6 +61,11 @@
void CLGaussian5x5::run()
{
CLScheduler::get().enqueue(_border_handler, false);
+
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_kernel_hor, false);
CLScheduler::get().enqueue(_kernel_vert);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index 8a4279e..8436dce 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -27,11 +27,11 @@
#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
#include "arm_compute/runtime/CL/CLPyramid.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
@@ -48,8 +48,10 @@
{
}
-CLGaussianPyramidHalf::CLGaussianPyramidHalf()
- : _border_handler(), _horizontal_reduction(), _vertical_reduction()
+CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT
+ : _border_handler(),
+ _horizontal_reduction(),
+ _vertical_reduction()
{
}
@@ -70,9 +72,9 @@
if(num_levels > 1)
{
- _border_handler = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
- _horizontal_reduction = arm_compute::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
- _vertical_reduction = arm_compute::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
+ _border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
+ _horizontal_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
+ _vertical_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
// Apply half scale to the X dimension of the tensor shape
TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -119,8 +121,9 @@
}
}
-CLGaussianPyramidOrb::CLGaussianPyramidOrb()
- : _gauss5x5(), _scale_nearest()
+CLGaussianPyramidOrb::CLGaussianPyramidOrb() // NOLINT
+ : _gauss5x5(),
+ _scale_nearest()
{
}
@@ -141,8 +144,8 @@
if(num_levels > 1)
{
- _gauss5x5 = arm_compute::cpp14::make_unique<CLGaussian5x5[]>(num_levels - 1);
- _scale_nearest = arm_compute::cpp14::make_unique<CLScaleKernel[]>(num_levels - 1);
+ _gauss5x5 = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(num_levels - 1);
+ _scale_nearest = arm_compute::support::cpp14::make_unique<CLScaleKernel[]>(num_levels - 1);
PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index b1b5a03..1470d5c 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -31,8 +31,8 @@
using namespace arm_compute;
-CLHOGDescriptor::CLHOGDescriptor()
- : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
{
}
@@ -71,9 +71,16 @@
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
_hog_space.allocator()->init(info_space);
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Initialise gradient kernel
_gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+ // Manage intermediate buffers
+ _memory_group.manage(&_hog_space);
+
// Initialise orientation binning kernel
_orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
@@ -88,6 +95,8 @@
void CLHOGDescriptor::run()
{
+ _memory_group.acquire();
+
// Run gradient
_gradient.run();
@@ -96,4 +105,6 @@
// Run block normalization
CLScheduler::get().enqueue(_block_norm);
+
+ _memory_group.release();
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index 2387474..51aeaed 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -29,8 +29,8 @@
using namespace arm_compute;
-CLHOGGradient::CLHOGGradient()
- : _derivative(), _mag_phase(), _gx(), _gy()
+CLHOGGradient::CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _derivative(), _mag_phase(), _gx(), _gy()
{
}
@@ -47,6 +47,10 @@
_gx.allocator()->init(info);
_gy.allocator()->init(info);
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
+
// Initialise derivative kernel
_derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
@@ -67,9 +71,13 @@
void CLHOGGradient::run()
{
+ _memory_group.acquire();
+
// Run derivative
_derivative.run();
// Run magnitude/phase kernel
CLScheduler::get().enqueue(_mag_phase);
+
+ _memory_group.release();
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index b8f2224..8012c2f 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -25,17 +25,31 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/Scheduler.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLHOGMultiDetection::CLHOGMultiDetection()
- : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(),
- _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0)
+CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _gradient_kernel(),
+ _orient_bin_kernel(),
+ _block_norm_kernel(),
+ _hog_detect_kernel(),
+ _non_maxima_kernel(),
+ _hog_space(),
+ _hog_norm_space(),
+ _detection_windows(),
+ _mag(),
+ _phase(),
+ _non_maxima_suppression(false),
+ _num_orient_bin_kernel(0),
+ _num_block_norm_kernel(0),
+ _num_hog_detect_kernel(0)
{
}
@@ -114,12 +128,12 @@
_num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
_num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
- _orient_bin_kernel = arm_compute::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
- _block_norm_kernel = arm_compute::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
- _hog_detect_kernel = arm_compute::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
- _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
- _hog_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
- _hog_norm_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
+ _orient_bin_kernel = arm_compute::support::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
+ _block_norm_kernel = arm_compute::support::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
+ _hog_detect_kernel = arm_compute::support::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
+ _non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
+ _hog_space = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
+ _hog_norm_space = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
// Allocate tensors for magnitude and phase
TensorInfo info_mag(shape_img, Format::S16);
@@ -128,6 +142,10 @@
TensorInfo info_phase(shape_img, Format::U8);
_phase.allocator()->init(info_phase);
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Initialise gradient kernel
_gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
@@ -153,10 +171,17 @@
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
_hog_space[i].allocator()->init(info_space);
+ // Manage intermediate buffers
+ _memory_group.manage(_hog_space.get() + i);
+
// Initialise orientation binning kernel
_orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
}
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+
// Configure CLTensor for the normalized HOG space and block normalization kernel
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
@@ -167,10 +192,19 @@
TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
_hog_norm_space[i].allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(_hog_norm_space.get() + i);
+
// Initialize block normalization kernel
_block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
}
+ // Allocate intermediate tensors
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ _hog_space[i].allocator()->allocate();
+ }
+
detection_window_strides->map(CLScheduler::get().queue(), true);
// Configure HOG detector kernel
@@ -187,14 +221,6 @@
_non_maxima_kernel->configure(_detection_windows, min_distance);
// Allocate intermediate tensors
- _mag.allocator()->allocate();
- _phase.allocator()->allocate();
-
- for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
- {
- _hog_space[i].allocator()->allocate();
- }
-
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
_hog_norm_space[i].allocator()->allocate();
@@ -205,6 +231,8 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
+ _memory_group.acquire();
+
// Reset detection window
_detection_windows->clear();
@@ -234,7 +262,9 @@
{
// Map detection windows array before computing non maxima suppression
_detection_windows->map(CLScheduler::get().queue(), true);
- _non_maxima_kernel->run(_non_maxima_kernel->window());
+ Scheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
_detection_windows->unmap(CLScheduler::get().queue());
}
-}
\ No newline at end of file
+
+ _memory_group.release();
+}
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 2db277f..059528f 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -27,7 +27,6 @@
#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
@@ -36,14 +35,28 @@
#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
#include "arm_compute/runtime/ITensorAllocator.h"
#include "arm_compute/runtime/Scheduler.h"
+#include "support/ToolchainSupport.h"
#include <cmath>
#include <utility>
using namespace arm_compute;
-CLHarrisCorners::CLHarrisCorners()
- : _sobel(), _harris_score(), _non_max_suppr(), _candidates(), _sort_euclidean(), _border_gx(), _border_gy(), _gx(), _gy(), _score(), _nonmax(), _corners_list(), _num_corner_candidates(0),
+CLHarrisCorners::CLHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _sobel(nullptr),
+ _harris_score(),
+ _non_max_suppr(),
+ _candidates(),
+ _sort_euclidean(),
+ _border_gx(),
+ _border_gy(),
+ _gx(),
+ _gy(),
+ _score(),
+ _nonmax(),
+ _corners_list(nullptr),
+ _num_corner_candidates(0),
_corners(nullptr)
{
}
@@ -62,6 +75,7 @@
const TensorShape shape = input->info()->tensor_shape();
const DataType dt = (gradient_size < 7) ? DataType::S16 : DataType::S32;
TensorInfo tensor_info(shape, 1, dt);
+
_gx.allocator()->init(tensor_info);
_gy.allocator()->init(tensor_info);
@@ -69,28 +83,32 @@
_score.allocator()->init(info_f32);
_nonmax.allocator()->init(info_f32);
- _corners_list = arm_compute::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+ _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
/* Set/init Sobel kernel accordingly with gradient_size */
switch(gradient_size)
{
case 3:
{
- auto k = arm_compute::cpp14::make_unique<CLSobel3x3>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
}
case 5:
{
- auto k = arm_compute::cpp14::make_unique<CLSobel5x5>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
}
case 7:
{
- auto k = arm_compute::cpp14::make_unique<CLSobel7x7>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
@@ -99,37 +117,49 @@
ARM_COMPUTE_ERROR("Gradient size not implemented");
}
- // Configure border filling before harris score
- _border_gx.configure(&_gx, block_size / 2, border_mode, constant_border_value);
- _border_gy.configure(&_gy, block_size / 2, border_mode, constant_border_value);
-
// Normalization factor
const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
const float pow4_normalization_factor = pow(norm_factor, 4);
+ // Manage intermediate buffers
+ _memory_group.manage(&_score);
+
// Set/init Harris Score kernel accordingly with block_size
_harris_score.configure(&_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- // Init non-maxima suppression function
- _non_max_suppr.configure(&_score, &_nonmax, border_mode == BorderMode::UNDEFINED);
-
- // Init corner candidates kernel
- _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
-
- // Init euclidean distance
- _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist);
+ // Configure border filling using the harris score kernel's border size
+ _border_gx.configure(&_gx, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
+ _border_gy.configure(&_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
// Allocate intermediate buffers
_gx.allocator()->allocate();
_gy.allocator()->allocate();
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_nonmax);
+
+ // Init non-maxima suppression function
+ _non_max_suppr.configure(&_score, &_nonmax, border_mode);
+
+ // Allocate intermediate buffers
_score.allocator()->allocate();
+
+ // Init corner candidates kernel
+ _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+
+ // Allocate intermediate buffers
_nonmax.allocator()->allocate();
+
+ // Init euclidean distance
+ _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist);
}
void CLHarrisCorners::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
+ _memory_group.acquire();
+
// Reset the number of corner candidates to 0
_num_corner_candidates = 0;
@@ -144,7 +174,7 @@
CLScheduler::get().enqueue(_harris_score, false);
// Run non-maxima suppression
- CLScheduler::get().enqueue(_non_max_suppr);
+ _non_max_suppr.run();
// Run corner candidate kernel
_nonmax.map(true);
@@ -152,6 +182,8 @@
_nonmax.unmap();
_corners->map(CLScheduler::get().queue(), true);
- _sort_euclidean.run(_sort_euclidean.window());
+ Scheduler::get().schedule(&_sort_euclidean, Window::DimY);
_corners->unmap(CLScheduler::get().queue());
+
+ _memory_group.release();
}
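
Callers are untouched by the internal reordering; for reference, a typical invocation is sketched below. The configure() parameter order (threshold, min_dist, sensitivity, gradient_size, block_size, corners, border mode) is quoted from the public header from memory, so treat it as an assumption.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));

    CLKeyPointArray corners(1000); // room for up to 1000 detected corners

    CLHarrisCorners harris;
    harris.configure(&src, 1e-5f /* threshold */, 5.0f /* min_dist */, 0.04f /* sensitivity */,
                     3 /* gradient_size */, 3 /* block_size */, &corners, BorderMode::UNDEFINED);

    src.allocator()->allocate();
    // ... fill src ...

    harris.run();
    CLScheduler::get().sync();
    return 0;
}
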
diff --git a/src/runtime/CL/functions/CLL2Normalize.cpp b/src/runtime/CL/functions/CLL2Normalize.cpp
new file mode 100644
index 0000000..99be8ca
--- /dev/null
+++ b/src/runtime/CL/functions/CLL2Normalize.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLL2Normalize.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLL2Normalize::CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
+{
+}
+
+void CLL2Normalize::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon)
+{
+ // Manage intermediate buffers
+ _memory_group.manage(&_sumsq);
+
+ // Configure kernels
+ _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
+ _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
+
+ // Allocate intermediate tensor
+ _sumsq.allocator()->allocate();
+}
+
+void CLL2Normalize::run()
+{
+ _memory_group.acquire();
+
+ _reduce_func.run();
+ CLScheduler::get().enqueue(_normalize_kernel, true);
+
+ _memory_group.release();
+}
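For reference, the math this new function composes on the device is a sum-of-squares reduction along `axis` followed by an element-wise scale. The standalone sketch below assumes the usual epsilon-guarded definition y_i = x_i / sqrt(max(sum_j x_j^2, epsilon)); the exact placement of epsilon inside the CL kernel is not shown in this diff, so treat the formula as illustrative rather than the kernel code.

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

std::vector<float> l2_normalize(const std::vector<float> &x, float epsilon = 1e-12f)
{
    // Stage 1: reduction (the role of _reduce_func writing into _sumsq)
    float sumsq = 0.f;
    for(float v : x)
    {
        sumsq += v * v;
    }

    // Stage 2: normalization (the role of _normalize_kernel)
    const float        inv_norm = 1.f / std::sqrt(std::max(sumsq, epsilon));
    std::vector<float> y(x.size());
    std::transform(x.begin(), x.end(), y.begin(), [inv_norm](float v) { return v * inv_norm; });
    return y;
}

int main()
{
    for(float v : l2_normalize({ 3.f, 4.f }))
    {
        std::cout << v << ' '; // 0.6 0.8
    }
    std::cout << '\n';
}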
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index d7ce206..a395487 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IPyramid.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
@@ -33,11 +32,18 @@
#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLLaplacianPyramid::CLLaplacianPyramid()
- : _num_levels(0), _gaussian_pyr_function(), _convf(), _subf(), _depth_function(), _gauss_pyr(), _conv_pyr()
+CLLaplacianPyramid::CLLaplacianPyramid() // NOLINT
+ : _num_levels(0),
+ _gaussian_pyr_function(),
+ _convf(),
+ _subf(),
+ _depth_function(),
+ _gauss_pyr(),
+ _conv_pyr()
{
}
@@ -64,8 +70,8 @@
// Create Gaussian Pyramid function
_gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
- _convf = arm_compute::cpp14::make_unique<CLGaussian5x5[]>(_num_levels);
- _subf = arm_compute::cpp14::make_unique<CLArithmeticSubtraction[]>(_num_levels);
+ _convf = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(_num_levels);
+ _subf = arm_compute::support::cpp14::make_unique<CLArithmeticSubtraction[]>(_num_levels);
for(unsigned int i = 0; i < _num_levels; ++i)
{
diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
index 1dfab74..678848b 100644
--- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
+++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
@@ -24,18 +24,21 @@
#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IPyramid.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
#include <cstddef>
using namespace arm_compute;
-CLLaplacianReconstruct::CLLaplacianReconstruct()
- : _tmp_pyr(), _addf(), _scalef(), _depthf()
+CLLaplacianReconstruct::CLLaplacianReconstruct() // NOLINT
+ : _tmp_pyr(),
+ _addf(),
+ _scalef(),
+ _depthf()
{
}
@@ -60,8 +63,8 @@
_tmp_pyr.init(pyramid_info);
// Allocate add and scale functions. Level 0 does not need to be scaled.
- _addf = arm_compute::cpp14::make_unique<CLArithmeticAddition[]>(num_levels);
- _scalef = arm_compute::cpp14::make_unique<CLScale[]>(num_levels - 1);
+ _addf = arm_compute::support::cpp14::make_unique<CLArithmeticAddition[]>(num_levels);
+ _scalef = arm_compute::support::cpp14::make_unique<CLScale[]>(num_levels - 1);
const size_t last_level = num_levels - 1;
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 263fb51..a89a45a 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -33,8 +33,9 @@
using namespace arm_compute;
-CLLocallyConnectedLayer::CLLocallyConnectedLayer()
- : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+ _is_first_run(false)
{
}
@@ -68,8 +69,8 @@
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
- stride_x, stride_y, pad_x, pad_y, conv_info.round());
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+ conv_info);
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
@@ -99,8 +100,12 @@
shape_gemm.set(1, mat_input_rows);
_gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+ // Manage intermediate buffers
+ _memory_group.manage(&_input_im2col_reshaped);
+ _memory_group.manage(&_gemm_output);
+
// Configure kernels
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(conv_w, conv_h), conv_info, _has_bias);
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
@@ -120,6 +125,8 @@
CLScheduler::get().enqueue(_weights_reshape_kernel);
}
+ _memory_group.acquire();
+
// Run input reshaping
CLScheduler::get().enqueue(_input_im2col_kernel);
@@ -128,4 +135,6 @@
// Reshape output matrix
CLScheduler::get().enqueue(_output_col2im_kernel, false);
+
+ _memory_group.release();
}
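The updated call above passes both weight dimensions and the whole PadStrideInfo to scaled_dimensions() instead of unpacked stride/pad values. As a reminder of the arithmetic behind the conv_w/conv_h checks, the sketch below re-derives the output size with the standard convolution formula under floor rounding; scaled_dims is a hypothetical helper, and the library's scaled_dimensions also supports ceil rounding, which is not modeled here.

#include <iostream>
#include <utility>

std::pair<unsigned int, unsigned int> scaled_dims(unsigned int w, unsigned int h,
                                                  unsigned int kernel_w, unsigned int kernel_h,
                                                  unsigned int stride_x, unsigned int stride_y,
                                                  unsigned int pad_x, unsigned int pad_y)
{
    // Standard output-size formula with floor rounding
    const unsigned int out_w = (w + 2 * pad_x - kernel_w) / stride_x + 1;
    const unsigned int out_h = (h + 2 * pad_y - kernel_h) / stride_y + 1;
    return { out_w, out_h };
}

int main()
{
    // e.g. a 32x32 input, 5x5 weights, stride 1, no padding -> 28x28 output,
    // so weights dimension 4 must hold 28 * 28 locally connected positions.
    std::pair<unsigned int, unsigned int> dims = scaled_dims(32, 32, 5, 5, 1, 1, 0, 0);
    std::cout << dims.first << "x" << dims.second << '\n'; // 28x28
}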
diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
index 51088cb..68b8c35 100644
--- a/src/runtime/CL/functions/CLMagnitude.cpp
+++ b/src/runtime/CL/functions/CLMagnitude.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLMagnitude.h"
#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
{
- auto k = arm_compute::cpp14::make_unique<CLMagnitudePhaseKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
k->configure(input1, input2, output, nullptr, mag_type);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
index 56ba146..838f7e7 100644
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -23,19 +23,19 @@
*/
#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
-#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
CLMeanStdDev::CLMeanStdDev()
: _mean_stddev_kernel(),
+ _fill_border_kernel(),
_global_sum(),
_global_sum_squared()
{
}
-void CLMeanStdDev::configure(const ICLImage *input, float *mean, float *stddev)
+void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev)
{
_global_sum = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
@@ -45,9 +45,11 @@
}
_mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
+ _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
}
void CLMeanStdDev::run()
{
+ CLScheduler::get().enqueue(_fill_border_kernel);
CLScheduler::get().enqueue(_mean_stddev_kernel);
}
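CLMeanStdDev keeps two device-side accumulators (_global_sum and _global_sum_squared) and now fills a constant border before the kernel runs. The host-side math such accumulators enable is sketched below, assuming the usual population formulation mean = S/N and stddev = sqrt(S2/N - mean^2); this is illustrative C++, not the kernel.

#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

void mean_stddev(const std::vector<uint8_t> &pixels, float &mean, float &stddev)
{
    // The two running accumulators the kernel maintains on the device
    uint64_t sum    = 0;
    uint64_t sum_sq = 0;
    for(uint8_t p : pixels)
    {
        sum += p;
        sum_sq += static_cast<uint64_t>(p) * p;
    }

    const float n = static_cast<float>(pixels.size());
    mean          = sum / n;
    stddev        = std::sqrt(sum_sq / n - mean * mean);
}

int main()
{
    float mean = 0.f, stddev = 0.f;
    mean_stddev({ 10, 20, 30, 40 }, mean, stddev);
    std::cout << mean << " " << stddev << '\n'; // 25 ~11.18
}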
diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp
index 0c10f9a..55f9eaa 100644
--- a/src/runtime/CL/functions/CLMedian3x3.cpp
+++ b/src/runtime/CL/functions/CLMedian3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLMedian3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLMedian3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp
index ad783d8..49dcbcb 100644
--- a/src/runtime/CL/functions/CLMinMaxLocation.cpp
+++ b/src/runtime/CL/functions/CLMinMaxLocation.cpp
@@ -25,8 +25,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
CLMinMaxLocation::CLMinMaxLocation()
: _min_max_kernel(),
_min_max_loc_kernel(),
@@ -41,7 +41,7 @@
{
}
-void CLMinMaxLocation::configure(const ICLImage *input, int32_t *min, int32_t *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
+void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
{
ARM_COMPUTE_ERROR_ON(nullptr == min);
ARM_COMPUTE_ERROR_ON(nullptr == max);
@@ -67,8 +67,8 @@
CLScheduler::get().enqueue(_min_max_loc_kernel, false);
// Update min and max
- q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), _min);
- q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), _max);
+ q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_min));
+ q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_max));
// Update min and max count
if(_min_count != nullptr)
@@ -96,3 +96,4 @@
_max_loc->resize(max_corner_size);
}
}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp
index b593a6c..d37412f 100644
--- a/src/runtime/CL/functions/CLNonLinearFilter.cpp
+++ b/src/runtime/CL/functions/CLNonLinearFilter.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h"
#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLNonLinearFilterKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLNonLinearFilterKernel>();
k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
index ca7d5ae..c0a0cef 100644
--- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
+++ b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode)
{
- auto k = arm_compute::cpp14::make_unique<CLNonMaximaSuppression3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLNonMaximaSuppression3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index 2d89ebd..f4bd494 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -33,28 +33,26 @@
using namespace arm_compute;
CLNormalizationLayer::CLNormalizationLayer()
- : _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
+ : _norm_kernel(), _border_handler()
{
}
-void CLNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
+void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_ERROR_ON(input == nullptr);
- _squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type()));
+ // Configure normalization kernel
+ _norm_kernel.configure(input, output, norm_info);
- _norm_kernel.configure(input, &_squared_input, output, norm_info);
- _multiply_kernel.configure(input, input, &_squared_input, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
- _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
-
- // Allocate intermediate buffers
- _squared_input.allocator()->allocate();
+ _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
}
void CLNormalizationLayer::run()
{
- CLScheduler::get().enqueue(_multiply_kernel, false);
+ // Run border handler
CLScheduler::get().enqueue(_border_handler, false);
- CLScheduler::get().enqueue(_norm_kernel, false);
+
+ // Run normalization kernel
+ CLScheduler::get().enqueue(_norm_kernel);
}
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index a6b0eb3..d00b1b5 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -26,7 +26,6 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/CL/CLPyramid.h"
@@ -34,12 +33,27 @@
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLOpticalFlow::CLOpticalFlow()
- : _tracker_init_kernel(), _tracker_stage0_kernel(), _tracker_stage1_kernel(), _tracker_finalize_kernel(), _func_scharr(), _scharr_gx(), _scharr_gy(), _old_points(nullptr),
- _new_points_estimates(nullptr), _new_points(nullptr), _old_points_internal(), _new_points_internal(), _coefficient_table(), _old_values(), _num_levels(0)
+CLOpticalFlow::CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _tracker_init_kernel(),
+ _tracker_stage0_kernel(),
+ _tracker_stage1_kernel(),
+ _tracker_finalize_kernel(),
+ _func_scharr(),
+ _scharr_gx(),
+ _scharr_gy(),
+ _old_points(nullptr),
+ _new_points_estimates(nullptr),
+ _new_points(nullptr),
+ _old_points_internal(),
+ _new_points_internal(),
+ _coefficient_table(),
+ _old_values(),
+ _num_levels(0)
{
}
@@ -70,21 +84,21 @@
const int old_values_list_length = list_length * window_dimension * window_dimension;
// Create kernels and tensors
- _tracker_init_kernel = arm_compute::cpp14::make_unique<CLLKTrackerInitKernel[]>(_num_levels);
- _tracker_stage0_kernel = arm_compute::cpp14::make_unique<CLLKTrackerStage0Kernel[]>(_num_levels);
- _tracker_stage1_kernel = arm_compute::cpp14::make_unique<CLLKTrackerStage1Kernel[]>(_num_levels);
- _func_scharr = arm_compute::cpp14::make_unique<CLScharr3x3[]>(_num_levels);
- _scharr_gx = arm_compute::cpp14::make_unique<CLTensor[]>(_num_levels);
- _scharr_gy = arm_compute::cpp14::make_unique<CLTensor[]>(_num_levels);
+ _tracker_init_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerInitKernel[]>(_num_levels);
+ _tracker_stage0_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage0Kernel[]>(_num_levels);
+ _tracker_stage1_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage1Kernel[]>(_num_levels);
+ _func_scharr = arm_compute::support::cpp14::make_unique<CLScharr3x3[]>(_num_levels);
+ _scharr_gx = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
+ _scharr_gy = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
// Create internal keypoint arrays
- _old_points_internal = arm_compute::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
+ _old_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
_old_points_internal->resize(list_length);
- _new_points_internal = arm_compute::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
+ _new_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
_new_points_internal->resize(list_length);
- _coefficient_table = arm_compute::cpp14::make_unique<CLCoefficientTableArray>(list_length);
+ _coefficient_table = arm_compute::support::cpp14::make_unique<CLCoefficientTableArray>(list_length);
_coefficient_table->resize(list_length);
- _old_values = arm_compute::cpp14::make_unique<CLOldValueArray>(old_values_list_length);
+ _old_values = arm_compute::support::cpp14::make_unique<CLOldValueArray>(old_values_list_length);
_old_values->resize(old_values_list_length);
_new_points->resize(list_length);
@@ -103,6 +117,10 @@
_scharr_gx[i].allocator()->init(tensor_info);
_scharr_gy[i].allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(_scharr_gx.get() + i);
+ _memory_group.manage(_scharr_gy.get() + i);
+
// Init Scharr kernel
_func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
@@ -131,6 +149,8 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
+ _memory_group.acquire();
+
for(unsigned int level = _num_levels; level > 0; --level)
{
// Run Scharr kernel
@@ -147,4 +167,6 @@
}
CLScheduler::get().enqueue(_tracker_finalize_kernel, true);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp
index a8cb22b..cf3fa7e 100644
--- a/src/runtime/CL/functions/CLPhase.cpp
+++ b/src/runtime/CL/functions/CLPhase.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLPhase.h"
#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type)
{
- auto k = arm_compute::cpp14::make_unique<CLMagnitudePhaseKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
k->configure(input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 8a86c2e..139d466 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLPixelWiseMultiplication::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
- auto k = arm_compute::cpp14::make_unique<CLPixelWiseMultiplicationKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseMultiplicationKernel>();
k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 1ef70f4..2cb7d63 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -24,14 +24,14 @@
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
{
// Configure pooling kernel
- auto k = arm_compute::cpp14::make_unique<CLPoolingLayerKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLPoolingLayerKernel>();
k->configure(input, output, pool_info);
_kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
new file mode 100644
index 0000000..ed1f51c
--- /dev/null
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLQuantizationLayer::CLQuantizationLayer()
+ : _quantize_kernel(), _min_max_kernel(), _min_max()
+{
+}
+
+void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel.
+ _min_max_kernel.configure(input, &_min_max);
+
+ // Configure quantize kernel
+ _quantize_kernel.configure(input, output, &_min_max);
+
+ // Allocate min_max tensor
+ _min_max.allocator()->allocate();
+}
+
+void CLQuantizationLayer::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ // Reset min and max
+ _min_max_kernel.reset(q);
+
+ // Run min-max kernel
+ CLScheduler::get().enqueue(_min_max_kernel, false);
+
+ // Run quantize kernel
+ CLScheduler::get().enqueue(_quantize_kernel, false);
+}
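The new CLQuantizationLayer is a two-pass pipeline: a min/max reduction into _min_max, then a quantization kernel driven by that result. The sketch below mirrors that structure on the host with an assumed 8-bit affine mapping q = round((x - min) / (max - min) * 255); the exact scaling and rounding performed by the CL kernel is not shown in this diff, so the formula is illustrative only.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<uint8_t> quantize(const std::vector<float> &x)
{
    // Pass 1: the role of _min_max_kernel
    const auto  mm    = std::minmax_element(x.begin(), x.end());
    const float min_v = *mm.first;
    const float range = std::max(*mm.second - min_v, 1e-6f); // guard against constant input

    // Pass 2: the role of _quantize_kernel, using the min/max from pass 1
    std::vector<uint8_t> q(x.size());
    std::transform(x.begin(), x.end(), q.begin(),
                   [&](float v) { return static_cast<uint8_t>(std::lround((v - min_v) / range * 255.f)); });
    return q;
}

int main()
{
    for(int v : quantize({ -1.f, 0.f, 1.f }))
    {
        std::cout << v << ' '; // 0 128 255
    }
    std::cout << '\n';
}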
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
new file mode 100644
index 0000000..0f480ee
--- /dev/null
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
+
+#include "arm_compute/core/CL/ICLArray.h"
+
+#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+ // Configure ROI pooling kernel
+ auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>();
+ k->configure(input, rois, output, pool_info);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
new file mode 100644
index 0000000..d02afb4
--- /dev/null
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
+{
+}
+
+void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+{
+ // Calculate number of WGs. 16 elements per thread, 8 threads per WG
+ unsigned int num_of_wg = ceil(input->info()->dimension(0) / 128.f);
+
+ // Calculate number of stages. First stage performs op and the rest reduction sum
+ // depending on the size of the input. Last stage should have only 1 WG.
+ _num_of_stages = num_of_wg / 128 + 2;
+
+ // Create temporary tensors
+ _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+
+ // Configure reduction operation kernels
+ _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
+ _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
+
+ TensorShape shape{ input->info()->tensor_shape() };
+ for(unsigned int i = 0; i < _num_of_stages - 1; i++)
+ {
+ shape.set(0, ceil(shape.x() / 128.f));
+ _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+ }
+
+ // Apply ReductionOperation only on first kernel
+ _memory_group.manage(_sums_vector.get());
+ _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, op);
+ _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+ // Apply ReductionOperation on intermediate stages
+ for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
+ {
+ _memory_group.manage(_sums_vector.get() + i);
+ _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
+ _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _sums_vector[i - 1].allocator()->allocate();
+ }
+
+ // Apply ReductionOperation on the last stage
+ const unsigned int last_stage = _num_of_stages - 1;
+ _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, ReductionOperation::SUM);
+ _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _sums_vector[last_stage - 1].allocator()->allocate();
+}
+
+void CLReductionOperation::run()
+{
+ _memory_group.acquire();
+
+ for(unsigned int i = 0; i < _num_of_stages; ++i)
+ {
+ CLScheduler::get().enqueue(_border_handlers_vector[i], false);
+ CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+ }
+
+ _memory_group.release();
+}
\ No newline at end of file
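The stage count above follows from the comment's 16 elements/thread x 8 threads/WG budget: each stage shrinks the x-dimension by a factor of 128, num_of_wg = ceil(width / 128), and _num_of_stages = num_of_wg / 128 + 2 (the first stage applies `op`, the rest sum, and the last writes to `output`). A small standalone check of that arithmetic for a few input widths:

#include <cmath>
#include <iostream>

int main()
{
    for(unsigned int width : { 100u, 4096u, 32768u })
    {
        // Same formulas as CLReductionOperation::configure()
        const unsigned int num_of_wg     = static_cast<unsigned int>(std::ceil(width / 128.f));
        const unsigned int num_of_stages = num_of_wg / 128 + 2;

        std::cout << "width " << width << ": " << num_of_wg << " WGs, " << num_of_stages << " stages, intermediate widths:";

        // Widths of the _sums_vector intermediates built in the configure loop
        unsigned int w = width;
        for(unsigned int i = 0; i < num_of_stages - 1; ++i)
        {
            w = static_cast<unsigned int>(std::ceil(w / 128.f));
            std::cout << ' ' << w;
        }
        std::cout << '\n';
    }
    // width 100:   1 WGs, 2 stages, intermediate widths: 1
    // width 4096:  32 WGs, 2 stages, intermediate widths: 32
    // width 32768: 256 WGs, 4 stages, intermediate widths: 256 2 1
}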
diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp
index f6b1713..bc3fd4e 100644
--- a/src/runtime/CL/functions/CLRemap.cpp
+++ b/src/runtime/CL/functions/CLRemap.cpp
@@ -26,10 +26,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -43,7 +43,7 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
- auto k = arm_compute::cpp14::make_unique<CLRemapKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLRemapKernel>();
k->configure(input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp
new file mode 100644
index 0000000..2ce83dc
--- /dev/null
+++ b/src/runtime/CL/functions/CLReshapeLayer.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLReshapeLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLReshapeLayerKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 043f873..49b0275 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -26,19 +26,14 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
{
- ARM_COMPUTE_ERROR_ON(output == input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- auto k = arm_compute::cpp14::make_unique<CLScaleKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>();
k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp
index c8bc465..73f8673 100644
--- a/src/runtime/CL/functions/CLScharr3x3.cpp
+++ b/src/runtime/CL/functions/CLScharr3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLScharr3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLScharr3x3Kernel>();
k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp
index 6b74eba..e227e58 100644
--- a/src/runtime/CL/functions/CLSobel3x3.cpp
+++ b/src/runtime/CL/functions/CLSobel3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLSobel3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3Kernel>();
k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp
index 098b546..d4bc855 100644
--- a/src/runtime/CL/functions/CLSobel5x5.cpp
+++ b/src/runtime/CL/functions/CLSobel5x5.cpp
@@ -33,8 +33,8 @@
using namespace arm_compute;
-CLSobel5x5::CLSobel5x5()
- : _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
+CLSobel5x5::CLSobel5x5(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
{
}
@@ -51,6 +51,8 @@
{
_tmp_x.allocator()->init(tensor_info);
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -59,6 +61,7 @@
else if(run_sobel_x)
{
_tmp_x.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
_sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -66,6 +69,7 @@
else if(run_sobel_y)
{
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_y.allocator()->allocate();
@@ -76,6 +80,11 @@
void CLSobel5x5::run()
{
CLScheduler::get().enqueue(_border_handler, false);
+
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_sobel_hor, false);
CLScheduler::get().enqueue(_sobel_vert);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp
index db84fa9..6083090 100644
--- a/src/runtime/CL/functions/CLSobel7x7.cpp
+++ b/src/runtime/CL/functions/CLSobel7x7.cpp
@@ -33,8 +33,8 @@
using namespace arm_compute;
-CLSobel7x7::CLSobel7x7()
- : _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
+CLSobel7x7::CLSobel7x7(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
{
}
@@ -51,6 +51,8 @@
{
_tmp_x.allocator()->init(tensor_info);
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -59,6 +61,7 @@
else if(run_sobel_x)
{
_tmp_x.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
_sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -66,6 +69,7 @@
else if(run_sobel_y)
{
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_y.allocator()->allocate();
@@ -76,6 +80,11 @@
void CLSobel7x7::run()
{
CLScheduler::get().enqueue(_border_handler, false);
+
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_sobel_hor, false);
CLScheduler::get().enqueue(_sobel_vert);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 2a78c58..7505a2c 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -25,29 +25,34 @@
#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
-CLSoftmaxLayer::CLSoftmaxLayer()
- : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
{
}
void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
// Create intermediate tensors shapes
- _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
+ _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
TensorShape shape = input->info()->tensor_shape();
shape.set(0, 1);
- TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
+ TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
_max.allocator()->init(tensor_info_max_sum);
_sum.allocator()->init(tensor_info_max_sum);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+ _memory_group.manage(&_max);
+ _memory_group.manage(&_sum);
+
// Configure Kernels
_max_kernel.configure(input, &_max);
_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
@@ -61,7 +66,11 @@
void CLSoftmaxLayer::run()
{
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_max_kernel, false);
CLScheduler::get().enqueue(_shift_exp_sum_kernel, false);
CLScheduler::get().enqueue(_norm_kernel);
+
+ _memory_group.release();
}
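The three kernels retained above implement the usual numerically stable softmax: a row maximum, shifted exponentials plus their sum, then a normalization pass. Subtracting the maximum before exponentiating keeps exp() from overflowing. A standalone host-side sketch of that decomposition, assuming the standard formula y_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)); this is not the CL kernel code.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> softmax(const std::vector<float> &x)
{
    // Stage 1: row maximum (the role of _max_kernel)
    const float max_val = *std::max_element(x.begin(), x.end());

    // Stage 2: shifted exponentials and their sum (the role of _shift_exp_sum_kernel -> _tmp, _sum)
    std::vector<float> tmp(x.size());
    float              sum = 0.f;
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        tmp[i] = std::exp(x[i] - max_val);
        sum += tmp[i];
    }

    // Stage 3: normalization (the role of _norm_kernel)
    for(float &v : tmp)
    {
        v /= sum;
    }
    return tmp;
}

int main()
{
    for(float v : softmax({ 1.f, 2.f, 3.f }))
    {
        std::cout << v << ' '; // ~0.09 0.24 0.67
    }
    std::cout << '\n';
}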
diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp
index 743ed5e..d187650 100644
--- a/src/runtime/CL/functions/CLTableLookup.cpp
+++ b/src/runtime/CL/functions/CLTableLookup.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLTableLookup.h"
#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLTableLookupKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLTableLookupKernel>();
k->configure(input, lut, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp
index e70f932..1b30b77 100644
--- a/src/runtime/CL/functions/CLThreshold.cpp
+++ b/src/runtime/CL/functions/CLThreshold.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLThreshold.h"
#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
{
- auto k = arm_compute::cpp14::make_unique<CLThresholdKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLThresholdKernel>();
k->configure(input, output, threshold, false_value, true_value, type, upper);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
index d802b4f..cd19e25 100644
--- a/src/runtime/CL/functions/CLTranspose.cpp
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLTranspose::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLTransposeKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp
index 537e0d9..f785c75 100644
--- a/src/runtime/CL/functions/CLWarpAffine.cpp
+++ b/src/runtime/CL/functions/CLWarpAffine.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLWarpAffineKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLWarpAffineKernel>();
k->configure(input, output, matrix, policy);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp
index a552ab4..b445b3b 100644
--- a/src/runtime/CL/functions/CLWarpPerspective.cpp
+++ b/src/runtime/CL/functions/CLWarpPerspective.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLWarpPerspectiveKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLWarpPerspectiveKernel>();
k->configure(input, output, matrix, policy);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 8869330..a83a0bc 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -28,91 +28,89 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
+#include <condition_variable>
#include <iostream>
-#include <semaphore.h>
+#include <mutex>
#include <system_error>
#include <thread>
-using namespace arm_compute;
-
-class arm_compute::Thread
+namespace arm_compute
+{
+class Thread
{
public:
- /** Start a new thread
- */
+ /** Start a new thread. */
Thread();
+
Thread(const Thread &) = delete;
Thread &operator=(const Thread &) = delete;
Thread(Thread &&) = delete;
Thread &operator=(Thread &&) = delete;
- /** Make the thread join
- */
+
+ /** Destructor. Make the thread join. */
~Thread();
+
/** Request the worker thread to start executing the given kernel
* This function will return as soon as the kernel has been sent to the worker thread.
* wait() needs to be called to ensure the execution is complete.
*/
- void start(ICPPKernel *kernel, const Window &window);
- /** Wait for the current kernel execution to complete
- */
+ void start(ICPPKernel *kernel, const Window &window, const ThreadInfo &info);
+
+ /** Wait for the current kernel execution to complete. */
void wait();
- /** Function ran by the worker thread
- */
+
+ /** Function run by the worker thread. */
void worker_thread();
private:
- std::thread _thread;
- ICPPKernel *_kernel{ nullptr };
- Window _window;
- sem_t _wait_for_work;
- sem_t _job_complete;
- std::exception_ptr _current_exception;
+ std::thread _thread;
+ ICPPKernel *_kernel{ nullptr };
+ Window _window;
+ ThreadInfo _info;
+ std::mutex _m;
+ std::condition_variable _cv;
+ bool _wait_for_work{ false };
+ bool _job_complete{ true };
+ std::exception_ptr _current_exception;
};
Thread::Thread()
- : _thread(), _window(), _wait_for_work(), _job_complete(), _current_exception(nullptr)
+ : _thread(), _window(), _info(), _m(), _cv(), _current_exception(nullptr)
{
- int ret = sem_init(&_wait_for_work, 0, 0);
- ARM_COMPUTE_ERROR_ON(ret < 0);
- ARM_COMPUTE_UNUSED(ret);
-
- ret = sem_init(&_job_complete, 0, 0);
- ARM_COMPUTE_ERROR_ON(ret < 0);
- ARM_COMPUTE_UNUSED(ret);
-
_thread = std::thread(&Thread::worker_thread, this);
}
Thread::~Thread()
{
- ARM_COMPUTE_ERROR_ON(!_thread.joinable());
-
- start(nullptr, Window());
- _thread.join();
-
- int ret = sem_destroy(&_wait_for_work);
- ARM_COMPUTE_ERROR_ON(ret < 0);
- ARM_COMPUTE_UNUSED(ret);
-
- ret = sem_destroy(&_job_complete);
- ARM_COMPUTE_ERROR_ON(ret < 0);
- ARM_COMPUTE_UNUSED(ret);
+ // Make sure worker thread has ended
+ if(_thread.joinable())
+ {
+ start(nullptr, Window(), ThreadInfo());
+ _thread.join();
+ }
}
-void Thread::start(ICPPKernel *kernel, const Window &window)
+void Thread::start(ICPPKernel *kernel, const Window &window, const ThreadInfo &info)
{
_kernel = kernel;
_window = window;
- int ret = sem_post(&_wait_for_work);
- ARM_COMPUTE_UNUSED(ret);
- ARM_COMPUTE_ERROR_ON(ret < 0);
+ _info = info;
+
+ {
+ std::lock_guard<std::mutex> lock(_m);
+ _wait_for_work = true;
+ _job_complete = false;
+ }
+ _cv.notify_one();
}
void Thread::wait()
{
- int ret = sem_wait(&_job_complete);
- ARM_COMPUTE_UNUSED(ret);
- ARM_COMPUTE_ERROR_ON(ret < 0);
+ {
+ std::unique_lock<std::mutex> lock(_m);
+ _cv.wait(lock, [&] { return _job_complete; });
+ }
+
if(_current_exception)
{
std::rethrow_exception(_current_exception);
@@ -121,9 +119,14 @@
void Thread::worker_thread()
{
- while(sem_wait(&_wait_for_work) >= 0)
+ while(true)
{
+ std::unique_lock<std::mutex> lock(_m);
+ _cv.wait(lock, [&] { return _wait_for_work; });
+ _wait_for_work = false;
+
_current_exception = nullptr;
+
// Time to exit
if(_kernel == nullptr)
{
@@ -133,49 +136,40 @@
try
{
_window.validate();
- _kernel->run(_window);
+ _kernel->run(_window, _info);
}
catch(...)
{
_current_exception = std::current_exception();
}
- int ret = sem_post(&_job_complete);
- ARM_COMPUTE_UNUSED(ret);
- ARM_COMPUTE_ERROR_ON(ret < 0);
+
+ _job_complete = true;
+ lock.unlock();
+ _cv.notify_one();
}
-
- ARM_COMPUTE_ERROR("Wait failed");
}
-namespace
-{
-void delete_threads(Thread *t)
-{
- delete[] t;
-}
-} // namespace
-
CPPScheduler &CPPScheduler::get()
{
static CPPScheduler scheduler;
return scheduler;
}
-unsigned int CPPScheduler::num_threads() const
-{
- return _num_threads;
-}
-
CPPScheduler::CPPScheduler()
: _num_threads(std::thread::hardware_concurrency()),
- _threads(std::unique_ptr<Thread[], void(*)(Thread *)>(new Thread[std::thread::hardware_concurrency() - 1], delete_threads))
+ _threads(_num_threads - 1)
{
}
void CPPScheduler::set_num_threads(unsigned int num_threads)
{
- const unsigned int num_cores = std::thread::hardware_concurrency();
- _num_threads = num_threads == 0 ? num_cores : num_threads;
+ _num_threads = num_threads == 0 ? std::thread::hardware_concurrency() : num_threads;
+ _threads.resize(_num_threads - 1);
+}
+
+unsigned int CPPScheduler::num_threads() const
+{
+ return _num_threads;
}
void CPPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
@@ -183,43 +177,51 @@
ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
/** [Scheduler example] */
+ ThreadInfo info;
+ info.cpu_info = _info;
+
const Window &max_window = kernel->window();
const unsigned int num_iterations = max_window.num_iterations(split_dimension);
- const unsigned int num_threads = std::min(num_iterations, _num_threads);
+ info.num_threads = std::min(num_iterations, _num_threads);
- if(!kernel->is_parallelisable() || 1 == num_threads)
+ if(num_iterations == 0)
{
- kernel->run(max_window);
+ return;
+ }
+
+ if(!kernel->is_parallelisable() || info.num_threads == 1)
+ {
+ kernel->run(max_window, info);
}
else
{
- for(unsigned int t = 0; t < num_threads; ++t)
- {
- Window win = max_window.split_window(split_dimension, t, num_threads);
- win.set_thread_id(t);
- win.set_num_threads(num_threads);
+ int t = 0;
+ auto thread_it = _threads.begin();
- if(t != num_threads - 1)
- {
- _threads[t].start(kernel, win);
- }
- else
- {
- kernel->run(win);
- }
+ for(; t < info.num_threads - 1; ++t, ++thread_it)
+ {
+ Window win = max_window.split_window(split_dimension, t, info.num_threads);
+ info.thread_id = t;
+ thread_it->start(kernel, win, info);
}
+ // Run last part on main thread
+ Window win = max_window.split_window(split_dimension, t, info.num_threads);
+ info.thread_id = t;
+ kernel->run(win, info);
+
try
{
- for(unsigned int t = 1; t < num_threads; ++t)
+ for(auto &thread : _threads)
{
- _threads[t - 1].wait();
+ thread.wait();
}
}
catch(const std::system_error &e)
{
- std::cout << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
+ std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
}
}
/** [Scheduler example] */
}
+} // namespace arm_compute
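The scheduler change above swaps POSIX semaphores for a std::mutex/std::condition_variable pair guarding two flags, with both sides waiting on a predicate so spurious wakeups are harmless. Below is a minimal standalone sketch of the same handshake, reduced to a single job with no kernel and global state for brevity; it illustrates the start()/wait()/worker_thread() ordering, not the scheduler itself.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex              m;
std::condition_variable cv;
bool                    wait_for_work = false;
bool                    job_complete  = true;
int                     result        = 0;

void worker()
{
    // Like Thread::worker_thread(): block until work is published
    std::unique_lock<std::mutex> lock(m);
    cv.wait(lock, [] { return wait_for_work; });
    wait_for_work = false;

    result = 42; // "run the kernel"

    // Report completion under the lock, then wake the waiter
    job_complete = true;
    lock.unlock();
    cv.notify_one();
}

int main()
{
    std::thread t(worker);

    // Like Thread::start(): publish the job under the lock, then notify
    {
        std::lock_guard<std::mutex> lock(m);
        wait_for_work = true;
        job_complete  = false;
    }
    cv.notify_one();

    // Like Thread::wait(): block until the worker reports completion
    {
        std::unique_lock<std::mutex> lock(m);
        cv.wait(lock, [] { return job_complete; });
    }
    std::cout << result << '\n'; // 42

    t.join();
}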
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index f086813..c8285b4 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -27,8 +27,8 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Utils.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
SingleThreadScheduler &SingleThreadScheduler::get()
{
static SingleThreadScheduler scheduler;
@@ -38,15 +38,19 @@
void SingleThreadScheduler::set_num_threads(unsigned int num_threads)
{
ARM_COMPUTE_UNUSED(num_threads);
+ ARM_COMPUTE_ERROR_ON(num_threads != 1);
}
void SingleThreadScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
{
ARM_COMPUTE_UNUSED(split_dimension);
- kernel->run(kernel->window());
+ ThreadInfo info;
+ info.cpu_info = cpu_info();
+ kernel->run(kernel->window(), info);
}
unsigned int SingleThreadScheduler::num_threads() const
{
return 1;
}
+} // namespace arm_compute
diff --git a/src/runtime/Distribution1D.cpp b/src/runtime/Distribution1D.cpp
index b067674..3431834 100644
--- a/src/runtime/Distribution1D.cpp
+++ b/src/runtime/Distribution1D.cpp
@@ -24,14 +24,14 @@
#include "arm_compute/runtime/Distribution1D.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <cstdint>
using namespace arm_compute;
Distribution1D::Distribution1D(size_t num_bins, int32_t offset, uint32_t range)
- : IDistribution1D(num_bins, offset, range), _data(arm_compute::cpp14::make_unique<uint32_t[]>(num_bins))
+ : IDistribution1D(num_bins, offset, range), _data(arm_compute::support::cpp14::make_unique<uint32_t[]>(num_bins))
{
}
diff --git a/src/runtime/HOG.cpp b/src/runtime/HOG.cpp
index 5d533dd..01640bb 100644
--- a/src/runtime/HOG.cpp
+++ b/src/runtime/HOG.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/HOG.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
@@ -37,7 +37,7 @@
{
ARM_COMPUTE_ERROR_ON(nullptr != _descriptor);
_info = input;
- _descriptor = arm_compute::cpp14::make_unique<float[]>(_info.descriptor_size());
+ _descriptor = arm_compute::support::cpp14::make_unique<float[]>(_info.descriptor_size());
}
float *HOG::descriptor() const
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
new file mode 100644
index 0000000..4292469
--- /dev/null
+++ b/src/runtime/IScheduler.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/IScheduler.h"
+
+#include <array>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+namespace
+{
+unsigned int get_cpu_impl()
+{
+#ifndef BARE_METAL
+ int fd = open("/proc/cpuinfo", 0); // NOLINT
+ std::array<char, 1200> buff{ {} };
+ char *pos = nullptr;
+ char *end = nullptr;
+ bool foundid = false;
+
+ int cpu = sched_getcpu();
+
+ if(fd == -1)
+ {
+ return 0;
+ }
+
+ int charsread = read(fd, buff.data(), 1200);
+ pos = buff.data();
+ end = buff.data() + charsread;
+
+ close(fd);
+
+ /* So, to date I've encountered two formats for /proc/cpuinfo.
+ *
+ * One of them just lists processor : n for each processor (with no
+ * other info), then at the end lists part information for the current
+ * CPU.
+ *
+ * The other has an entire clause (including part number info) for each
+ * CPU in the system, with "processor : n" headers.
+ *
+ * We can cope with either of these formats by waiting to see
+ * "processor: n" (where n = our CPU ID), and then looking for the next
+ * "CPU part" field.
+ */
+ while(pos < end)
+ {
+ if(foundid && strncmp(pos, "CPU part", 8) == 0)
+ {
+ /* Found part number */
+ pos += 11;
+
+ for(char *ch = pos; ch < end; ch++)
+ {
+ if(*ch == '\n')
+ {
+ *ch = '\0';
+ break;
+ }
+ }
+
+ return strtoul(pos, nullptr, 0);
+ }
+
+ if(strncmp(pos, "processor", 9) == 0)
+ {
+ /* Found processor ID, see if it's ours. */
+ pos += 11;
+
+ for(char *ch = pos; ch < end; ch++)
+ {
+ if(*ch == '\n')
+ {
+ *ch = '\0';
+ break;
+ }
+ }
+
+ int num = strtol(pos, nullptr, 0);
+
+ if(num == cpu)
+ {
+ foundid = true;
+ }
+ }
+
+ while(pos < end)
+ {
+ char ch = *pos++;
+ if(ch == '\n' || ch == '\0')
+ {
+ break;
+ }
+ }
+ }
+#endif /* BARE_METAL */
+
+ return 0;
+}
+} // namespace
+
+namespace arm_compute
+{
+IScheduler::IScheduler()
+{
+ switch(get_cpu_impl())
+ {
+ case 0xd03:
+ _info.CPU = CPUTarget::A53;
+ break;
+ default:
+#ifdef __arm__
+ _info.CPU = CPUTarget::ARMV7;
+#elif __aarch64__
+ _info.CPU = CPUTarget::ARMV8;
+#else /* __arm__ || __aarch64__ */
+ _info.CPU = CPUTarget::INTRINSICS;
+#endif /* __arm__ || __aarch64__ */
+ break;
+ }
+
+ _info.L1_size = 31000;
+ _info.L2_size = 500000;
+}
+
+void IScheduler::set_target(CPUTarget target)
+{
+ _info.CPU = target;
+}
+
+CPUInfo IScheduler::cpu_info() const
+{
+ return _info;
+}
+} // namespace arm_compute
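The new IScheduler constructor reads /proc/cpuinfo once, finds the clause describing the core it is running on, and maps the hexadecimal "CPU part" value to a CPUTarget (0xd03 selects A53; anything else falls back to a generic ARMv7/ARMv8/intrinsics target), then seeds rough L1/L2 cache sizes. A simplified sketch of that probe using iostreams follows; the function name and the way the two cpuinfo layouts are handled are approximations, not the library code.

#include <fstream>
#include <string>

unsigned int cpu_part_for(int target_cpu)
{
    std::ifstream cpuinfo("/proc/cpuinfo");
    std::string   line;
    bool          in_target_clause = false;

    while(std::getline(cpuinfo, line))
    {
        const auto colon = line.find(':');
        if(colon == std::string::npos)
        {
            continue;
        }

        if(line.compare(0, 9, "processor") == 0)
        {
            // "processor : n" -- remember whether the clause that follows describes our core.
            in_target_clause = (std::stoi(line.substr(colon + 1)) == target_cpu);
        }
        else if(in_target_clause && line.compare(0, 8, "CPU part") == 0)
        {
            // "CPU part : 0xd03" -- parse the hex part number (0xd03 is Cortex-A53).
            return std::stoul(line.substr(colon + 1), nullptr, 0);
        }
    }

    return 0; // Unknown: the caller falls back to a generic target.
}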
diff --git a/src/runtime/LutAllocator.cpp b/src/runtime/LutAllocator.cpp
index 17baf21..eb9051c 100644
--- a/src/runtime/LutAllocator.cpp
+++ b/src/runtime/LutAllocator.cpp
@@ -23,7 +23,7 @@
*/
#include "arm_compute/runtime/LutAllocator.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
@@ -39,7 +39,7 @@
void LutAllocator::allocate()
{
- _buffer = arm_compute::cpp14::make_unique<uint8_t[]>(size());
+ _buffer = arm_compute::support::cpp14::make_unique<uint8_t[]>(size());
}
uint8_t *LutAllocator::lock()
diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp
new file mode 100644
index 0000000..4dfa28b
--- /dev/null
+++ b/src/runtime/MemoryManagerOnDemand.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/ILifetimeManager.h"
+#include "arm_compute/runtime/IPoolManager.h"
+
+#include <memory>
+
+using namespace arm_compute;
+
+MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, std::shared_ptr<IPoolManager> pool_manager)
+ : _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager)), _allocator(nullptr), _is_finalized(false), _num_pools(1)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr, "Lifetime manager not specified correctly!");
+ ARM_COMPUTE_ERROR_ON_MSG(!_pool_mgr, "Pool manager not specified correctly!");
+}
+
+bool MemoryManagerOnDemand::is_finalized() const
+{
+ return _is_finalized;
+}
+
+void MemoryManagerOnDemand::set_num_pools(unsigned int num_pools)
+{
+ ARM_COMPUTE_ERROR_ON(num_pools == 0);
+ _num_pools = num_pools;
+}
+
+void MemoryManagerOnDemand::set_allocator(IAllocator *allocator)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(is_finalized(), "Memory manager is already finalized!");
+ ARM_COMPUTE_ERROR_ON(allocator == nullptr);
+ _allocator = allocator;
+}
+
+ILifetimeManager *MemoryManagerOnDemand::lifetime_manager()
+{
+ return _lifetime_mgr.get();
+}
+
+IPoolManager *MemoryManagerOnDemand::pool_manager()
+{
+ return _pool_mgr.get();
+}
+
+void MemoryManagerOnDemand::finalize()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(is_finalized(), "Memory manager is already finalized!");
+ ARM_COMPUTE_ERROR_ON(!_lifetime_mgr);
+ ARM_COMPUTE_ERROR_ON(!_pool_mgr);
+ ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr->are_all_finalized(), "Not all object lifetimes have been finalized!");
+ ARM_COMPUTE_ERROR_ON(_allocator == nullptr);
+
+ // Create pools
+ auto pool_template = _lifetime_mgr->create_pool(_allocator);
+ for(int i = _num_pools; i > 1; --i)
+ {
+ auto pool = pool_template->duplicate();
+ _pool_mgr->register_pool(std::move(pool));
+ }
+ _pool_mgr->register_pool(std::move(pool_template));
+
+ // Set finalized to true
+ _is_finalized = true;
+}
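MemoryManagerOnDemand is new in this drop. The intended wiring is: construct it from a lifetime manager and a pool manager, hand it to the functions that should share intermediate storage, then attach an allocator, pick the number of pools and call finalize() once those functions are configured. A sketch of that flow is below; the concrete BlobLifetimeManager/PoolManager/Allocator class names and headers are assumed rather than taken from this patch.

#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"

// Assumed concrete managers/allocator; exact headers and class names are not part of this diff.
#include "arm_compute/runtime/Allocator.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/PoolManager.h"

#include <memory>

void build_with_memory_manager()
{
    using namespace arm_compute;

    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>(); // assumed ILifetimeManager implementation
    auto pool_mgr     = std::make_shared<PoolManager>();         // assumed IPoolManager implementation
    auto mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    // Functions constructed with the same manager share their intermediate buffers through it.
    NEConvolutionLayer conv(mm);
    // ... conv.configure(...) and any other functions using mm ...

    // Once everything is configured, back the pools with an allocator and finalize.
    Allocator allocator; // assumed IAllocator implementation
    mm->set_allocator(&allocator);
    mm->set_num_pools(1);
    mm->finalize();
}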
diff --git a/src/runtime/MultiHOG.cpp b/src/runtime/MultiHOG.cpp
index 003dc93..e0b60b1 100644
--- a/src/runtime/MultiHOG.cpp
+++ b/src/runtime/MultiHOG.cpp
@@ -24,13 +24,13 @@
#include "arm_compute/runtime/MultiHOG.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IMultiHOG.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
MultiHOG::MultiHOG(size_t num_models)
- : _num_models(num_models), _model(arm_compute::cpp14::make_unique<HOG[]>(_num_models))
+ : _num_models(num_models), _model(arm_compute::support::cpp14::make_unique<HOG[]>(_num_models))
{
}
diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp
index 6f0da85..23d9872 100644
--- a/src/runtime/NEON/INESimpleFunction.cpp
+++ b/src/runtime/NEON/INESimpleFunction.cpp
@@ -27,13 +27,14 @@
using namespace arm_compute;
-INESimpleFunction::INESimpleFunction()
- : _kernel(), _border_handler()
+INESimpleFunction::INESimpleFunction() // NOLINT
+ : _kernel(),
+ _border_handler()
{
}
void INESimpleFunction::run()
{
- _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
NEScheduler::get().schedule(_kernel.get(), Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
index b39feb3..b4620f1 100644
--- a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
+++ b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEAbsoluteDifference::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEAbsoluteDifferenceKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEAbsoluteDifferenceKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
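The recurring change through the remaining NEON functions is purely mechanical: the C++14 make_unique helper moved from arm_compute/core/Helpers.h to support/ToolchainSupport.h under arm_compute::support::cpp14. A compact illustration of the resulting configure pattern (build_kernel is an illustrative helper, not part of the library):

#include "support/ToolchainSupport.h"

#include <memory>
#include <utility>

template <typename KernelType, typename... Args>
std::unique_ptr<KernelType> build_kernel(Args &&... args)
{
    // Same pattern as the configure() bodies above: create, configure, hand over ownership.
    auto k = arm_compute::support::cpp14::make_unique<KernelType>();
    k->configure(std::forward<Args>(args)...);
    return k;
}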
diff --git a/src/runtime/NEON/functions/NEAccumulate.cpp b/src/runtime/NEON/functions/NEAccumulate.cpp
index c39abfc..49524d2 100644
--- a/src/runtime/NEON/functions/NEAccumulate.cpp
+++ b/src/runtime/NEON/functions/NEAccumulate.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEAccumulate.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEAccumulate::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEAccumulateKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEAccumulateKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
@@ -41,13 +41,13 @@
{
if(use_fp16)
{
- auto k = arm_compute::cpp14::make_unique<NEAccumulateWeightedFP16Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEAccumulateWeightedFP16Kernel>();
k->configure(input, alpha, output);
_kernel = std::move(k);
}
else
{
- auto k = arm_compute::cpp14::make_unique<NEAccumulateWeightedKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEAccumulateWeightedKernel>();
k->configure(input, alpha, output);
_kernel = std::move(k);
}
@@ -55,7 +55,7 @@
void NEAccumulateSquared::configure(const ITensor *input, uint32_t shift, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEAccumulateSquaredKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEAccumulateSquaredKernel>();
k->configure(input, shift, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index f5d81d7..57a1738 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -23,14 +23,14 @@
*/
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-void NEActivationLayer::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
+void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
{
- auto k = arm_compute::cpp14::make_unique<NEActivationLayerKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEActivationLayerKernel>();
k->configure(input, output, activation_info);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 50cc38b..11f5aa7 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
{
- auto k = arm_compute::cpp14::make_unique<NEArithmeticAdditionKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>();
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index a3d27c0..37586af 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
{
- auto k = arm_compute::cpp14::make_unique<NEArithmeticSubtractionKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEArithmeticSubtractionKernel>();
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index a24429c..ef79b02 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -37,7 +37,7 @@
{
}
-void NEBatchNormalizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
{
// Configure kernel
_norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
index 5aafc51..7982095 100644
--- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEBitwiseAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEBitwiseAndKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEBitwiseAndKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp
index af3df6e..c55957e 100644
--- a/src/runtime/NEON/functions/NEBitwiseNot.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEBitwiseNot::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEBitwiseNotKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEBitwiseNotKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp
index d12c5e5..01036da 100644
--- a/src/runtime/NEON/functions/NEBitwiseOr.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEBitwiseOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEBitwiseOrKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEBitwiseOrKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp
index 65c943e..4591698 100644
--- a/src/runtime/NEON/functions/NEBitwiseXor.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEBitwiseXor::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEBitwiseXorKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEBitwiseXorKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEBox3x3.cpp b/src/runtime/NEON/functions/NEBox3x3.cpp
index 7f0b45d..46cf259 100644
--- a/src/runtime/NEON/functions/NEBox3x3.cpp
+++ b/src/runtime/NEON/functions/NEBox3x3.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBox3x3.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -35,13 +35,13 @@
{
if(use_fp16)
{
- auto k = arm_compute::cpp14::make_unique<NEBox3x3FP16Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEBox3x3FP16Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
}
else
{
- auto k = arm_compute::cpp14::make_unique<NEBox3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEBox3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 26f31f5..9be1df6 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NECannyEdge.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
@@ -35,14 +34,27 @@
#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <cstring>
#include <utility>
using namespace arm_compute;
-NECannyEdge::NECannyEdge()
- : _sobel(), _gradient(), _non_max_suppr(), _edge_trace(), _border_mag_gradient(), _border_edge_trace(), _gx(), _gy(), _magnitude(), _phase(), _nonmax(), _output(nullptr)
+NECannyEdge::NECannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _sobel(),
+ _gradient(),
+ _non_max_suppr(),
+ _edge_trace(),
+ _border_mag_gradient(),
+ _border_edge_trace(),
+ _gx(),
+ _gy(),
+ _magnitude(),
+ _phase(),
+ _nonmax(),
+ _output(nullptr)
{
}
@@ -82,22 +94,26 @@
_phase.allocator()->init(info);
_nonmax.allocator()->init(info);
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
+
// Configure/Init sobelNxN
if(gradient_size == 3)
{
- auto k = arm_compute::cpp14::make_unique<NESobel3x3>();
+ auto k = arm_compute::support::cpp14::make_unique<NESobel3x3>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
else if(gradient_size == 5)
{
- auto k = arm_compute::cpp14::make_unique<NESobel5x5>();
+ auto k = arm_compute::support::cpp14::make_unique<NESobel5x5>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
else if(gradient_size == 7)
{
- auto k = arm_compute::cpp14::make_unique<NESobel7x7>();
+ auto k = arm_compute::support::cpp14::make_unique<NESobel7x7>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
@@ -106,20 +122,31 @@
ARM_COMPUTE_ERROR("Gradient size not supported\n");
}
+ // Manage intermediate buffers
+ _memory_group.manage(&_magnitude);
+ _memory_group.manage(&_phase);
+
// Configure gradient
if(use_fp16)
{
- auto k = arm_compute::cpp14::make_unique<NEGradientFP16Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEGradientFP16Kernel>();
k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
_gradient = std::move(k);
}
else
{
- auto k = arm_compute::cpp14::make_unique<NEGradientKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEGradientKernel>();
k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
_gradient = std::move(k);
}
+ // Allocate intermediate tensors
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_nonmax);
+
// Configure non-maxima suppression
_non_max_suppr.configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED);
@@ -127,6 +154,10 @@
// it. If border mode is undefined filling the border is a nop.
_border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), border_mode, constant_border_value);
+ // Allocate intermediate tensors
+ _phase.allocator()->allocate();
+ _magnitude.allocator()->allocate();
+
// Configure edge tracing
_edge_trace.configure(&_nonmax, output);
@@ -134,10 +165,6 @@
_border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, 0);
// Allocate intermediate tensors
- _gx.allocator()->allocate();
- _gy.allocator()->allocate();
- _phase.allocator()->allocate();
- _magnitude.allocator()->allocate();
_nonmax.allocator()->allocate();
}
@@ -146,11 +173,13 @@
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
ARM_COMPUTE_ERROR_ON(_output == nullptr);
+ _memory_group.acquire();
+
// Run sobelNxN
_sobel->run();
// Fill border before non-maxima suppression. Nop for border mode undefined.
- _border_mag_gradient.run(_border_mag_gradient.window());
+ NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ);
// Run gradient
NEScheduler::get().schedule(_gradient.get(), Window::DimY);
@@ -162,8 +191,10 @@
memset(_output->buffer(), 0, _output->info()->total_size());
// Fill border before edge trace
- _border_edge_trace.run(_border_edge_trace.window());
+ NEScheduler::get().schedule(&_border_edge_trace, Window::DimZ);
// Run edge tracing
- _edge_trace.run(_edge_trace.window());
+ NEScheduler::get().schedule(&_edge_trace, Window::DimY);
+
+ _memory_group.release();
}
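NECannyEdge, like the other functions gaining a memory manager in this patch, follows a fixed MemoryGroup discipline: manage() an intermediate tensor before the kernels that use it are configured, allocate() it after the last consumer is configured, and bracket run() with acquire()/release() so pool memory is held only while the function executes. A minimal sketch of that shape, assuming the member type is arm_compute::MemoryGroup as declared in the function headers:

#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"

#include <memory>
#include <utility>

class ExampleFunction
{
public:
    ExampleFunction(std::shared_ptr<arm_compute::IMemoryManager> memory_manager)
        : _memory_group(std::move(memory_manager)), _tmp()
    {
    }

    void configure(/* inputs / outputs */)
    {
        // 1) Declare the intermediate to the group before configuring its consumers...
        _memory_group.manage(&_tmp);
        // ... configure the kernels that produce and consume _tmp ...
        // 2) ...and allocate it only once the last consumer has been configured.
        _tmp.allocator()->allocate();
    }

    void run()
    {
        // 3) Backing memory is only guaranteed between acquire() and release().
        _memory_group.acquire();
        // ... schedule the kernels ...
        _memory_group.release();
    }

private:
    arm_compute::MemoryGroup _memory_group;
    arm_compute::Tensor      _tmp;
};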
diff --git a/src/runtime/NEON/functions/NEChannelCombine.cpp b/src/runtime/NEON/functions/NEChannelCombine.cpp
index 84d4fff..9166aa9 100644
--- a/src/runtime/NEON/functions/NEChannelCombine.cpp
+++ b/src/runtime/NEON/functions/NEChannelCombine.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEChannelCombine.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,14 +32,14 @@
void NEChannelCombine::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEChannelCombineKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEChannelCombineKernel>();
k->configure(plane0, plane1, plane2, plane3, output);
_kernel = std::move(k);
}
void NEChannelCombine::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<NEChannelCombineKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEChannelCombineKernel>();
k->configure(plane0, plane1, plane2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEChannelExtract.cpp b/src/runtime/NEON/functions/NEChannelExtract.cpp
index 634e918..7b8a993 100644
--- a/src/runtime/NEON/functions/NEChannelExtract.cpp
+++ b/src/runtime/NEON/functions/NEChannelExtract.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEChannelExtract.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,14 +32,14 @@
void NEChannelExtract::configure(const ITensor *input, Channel channel, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEChannelExtractKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEChannelExtractKernel>();
k->configure(input, channel, output);
_kernel = std::move(k);
}
void NEChannelExtract::configure(const IMultiImage *input, Channel channel, IImage *output)
{
- auto k = arm_compute::cpp14::make_unique<NEChannelExtractKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEChannelExtractKernel>();
k->configure(input, channel, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEColorConvert.cpp b/src/runtime/NEON/functions/NEColorConvert.cpp
index bbaa832..b9fe1ff 100644
--- a/src/runtime/NEON/functions/NEColorConvert.cpp
+++ b/src/runtime/NEON/functions/NEColorConvert.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEColorConvert.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,28 +32,28 @@
void NEColorConvert::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void NEColorConvert::configure(const IMultiImage *input, IImage *output)
{
- auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void NEColorConvert::configure(const IImage *input, IMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void NEColorConvert::configure(const IMultiImage *input, IMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index 3f39ae2..f10ffa6 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NEConvolution.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
#include "arm_compute/core/PixelValue.h"
@@ -33,6 +32,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <array>
#include <utility>
@@ -41,15 +41,15 @@
void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NEConvolution3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEConvolution3x3Kernel>();
k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
template <unsigned int matrix_size>
-NEConvolutionSquare<matrix_size>::NEConvolutionSquare()
- : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+NEConvolutionSquare<matrix_size>::NEConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
{
}
@@ -72,6 +72,10 @@
_tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, intermediate_type));
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+
+ // Calculate scale
if(scale == 0)
{
scale = calculate_matrix_scale(conv, matrix_size);
@@ -94,12 +98,16 @@
template <unsigned int matrix_size>
void NEConvolutionSquare<matrix_size>::run()
{
- _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
if(_is_separable)
{
+ _memory_group.acquire();
+
NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+
+ _memory_group.release();
}
else
{
@@ -113,7 +121,7 @@
void NEConvolutionRectangle::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NEConvolutionRectangleKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEConvolutionRectangleKernel>();
k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index bd688cf..40862fc 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -23,32 +23,41 @@
*/
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
#include <cmath>
#include <tuple>
-using namespace arm_compute;
-
-NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights()
- : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+namespace arm_compute
+{
+NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
{
}
void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
@@ -69,8 +78,11 @@
TensorInfo info_wr(shape_wr, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
_weights_reshaped.allocator()->init(info_wr);
+ _memory_group.manage(&_weights_reshaped);
+
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_weights_transposed_kernel.configure(&_weights_reshaped, output);
+
_weights_reshaped.allocator()->allocate();
}
else
@@ -81,32 +93,34 @@
void NEConvolutionLayerReshapeWeights::run()
{
+ _memory_group.acquire();
+
NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+
if(_transpose1xW)
{
NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY);
}
+
+ _memory_group.release();
}
-NEConvolutionLayer::NEConvolutionLayer()
- : _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
- _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _output_col2im_kernel(),
+ _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _workspace(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
{
}
void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
@@ -131,94 +145,165 @@
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size() : weights->info()->dimension(0);
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
- stride_x, stride_y, pad_x, pad_y, conv_info.round());
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+ const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
+ const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+ conv_info);
- // Check if its a "fully connected" convolution
+ // Check if its a "fully connected" convolution, i.e. the output size is 1x1xnum_kernels
_is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+#if defined(__arm__)
+ if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
+ }
+#elif defined(__aarch64__)
+ if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
+ }
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
unsigned int mat_weights_cols = weights->info()->dimension(3);
unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
// Reshape weights if needed
- if(_are_weights_reshaped)
+ if(_mm_optimised_kernel != nullptr)
{
- mat_weights_cols = output->info()->dimension(2);
- const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
- mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
- }
- else
- {
- if(_is_fully_connected_convolution)
+ if(_are_weights_reshaped)
{
- // Create tensor to store the reshaped weights
- TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
- _weights_reshaped.allocator()->init(info_wr);
- _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->info()->dimension(1);
}
else
{
- // Create tensor to store transposed weights
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _weights_reshaped.allocator()->init(info_wt);
- _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
+ TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
+
+ // Create tensor to store the reshaped weights
+ _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+ weights = &_weights_reshaped;
}
- weights = &_weights_reshaped;
+ }
+ else
+ {
+ if(_are_weights_reshaped)
+ {
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->info()->dimension(0) / 4 + (_has_bias ? 1 : 0);
+ }
+ else
+ {
+ TensorShape reshaped_weights_shape;
+
+ if(_is_fully_connected_convolution)
+ {
+ reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
+ }
+ else
+ {
+ // Create tensor to store transposed weights
+ const float transpose_width = 16.0f / input->info()->element_size();
+ reshaped_weights_shape = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
+ static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
+ }
+
+ // Create tensor to store the reshaped weights
+ _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, !_is_fully_connected_convolution /* 1xW transpose */);
+ weights = &_weights_reshaped;
+ }
}
// Create tensor to store im2col reshaped inputs
const unsigned int mat_input_cols = mat_weights_rows;
const unsigned int mat_input_rows = conv_w * conv_h;
- TensorShape shape_im2col = input->info()->tensor_shape();
+
+ TensorShape shape_im2col(input->info()->tensor_shape());
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
_input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+ _memory_group.manage(&_input_im2col_reshaped);
// Create tensor (interleave) to prepare input tensor for GEMM
- if(!_is_fully_connected_convolution)
+ if(!_is_fully_connected_convolution && _mm_optimised_kernel == nullptr)
{
- TensorShape shape_interleaved = shape_im2col;
+ TensorShape shape_interleaved(shape_im2col);
shape_interleaved.set(0, shape_interleaved.x() * 4);
shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
_input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+ _memory_group.manage(&_input_interleaved_reshaped);
}
// Create GEMM output tensor
- TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, mat_input_rows);
_gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
+ _memory_group.manage(&_gemm_output);
// Configure kernels
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
- if(_is_fully_connected_convolution)
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
+
+#if defined(__arm__) || defined(__aarch64__)
+ if(_mm_optimised_kernel != nullptr)
{
- _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ struct CPUInfo ci = NEScheduler::get().cpu_info();
+
+ const int M = _gemm_output.info()->tensor_shape().y();
+ const int N = _gemm_output.info()->tensor_shape().x();
+ const int K = _input_im2col_reshaped.info()->tensor_shape().x();
+
+#if defined(__arm__)
+ GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+#elif defined(__aarch64__)
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+ constexpr size_t alignment = 4096;
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ _memory_group.manage(&_workspace);
+
+ // Configure matrix multiplication kernel
+ if(_is_fully_connected_convolution)
+ {
+ _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f, false, false);
+ }
+ else
+ {
+ _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
+ }
+
+ _workspace.allocator()->allocate();
}
else
+#endif /* defined(__arm__) || defined(__aarch64__) */
{
- _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
- _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+ if(_is_fully_connected_convolution)
+ {
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ }
+ else
+ {
+ _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+ _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+ _input_interleaved_reshaped.allocator()->allocate();
+ }
}
+
+ _input_im2col_reshaped.allocator()->allocate();
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+ _gemm_output.allocator()->allocate();
+
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
// Allocate intermediate tensor
if(!_are_weights_reshaped)
{
_weights_reshaped.allocator()->allocate();
}
- _input_im2col_reshaped.allocator()->allocate();
- if(!_is_fully_connected_convolution)
- {
- _input_interleaved_reshaped.allocator()->allocate();
- }
- _gemm_output.allocator()->allocate();
}
void NEConvolutionLayer::run()
@@ -230,17 +315,30 @@
_reshape_weights.run();
}
+ _memory_group.acquire();
+
// Run input reshaping
NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
- if(!_is_fully_connected_convolution)
- {
- // Run interleave
- NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
- }
// Runs matrix multiply on reshaped matrices
- NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+ if(_mm_optimised_kernel != nullptr)
+ {
+ NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+ }
+ else
+ {
+ if(!_is_fully_connected_convolution)
+ {
+ // Run interleave
+ NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
+ }
+
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+ }
// Reshape output matrix
NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+
+ _memory_group.release();
}
+} // namespace arm_compute
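Two things change in NEConvolutionLayer: intermediate tensors are routed through the memory group, and on ARMv7/ARMv8 with F32 data the generic GEMM kernel is replaced by the assembly-backed NEGEMMAArch32/NEGEMMAArch64 kernels plus a per-thread workspace sized from GemmInterleaved::get_working_size(). The im2col + GEMM bookkeeping is otherwise unchanged; as a concrete check of the shape arithmetic above (assuming a hypothetical 224x224x3 input, 64 kernels of 7x7x3, stride 2, pad 3, a bias, and floor rounding):

// conv_w = conv_h   = (224 + 2*3 - 7) / 2 + 1                = 112
// mat_weights_cols  = number of kernels                      = 64
// mat_weights_rows  = 7 * 7 * 3 + 1 (bias row)               = 148
// shape_im2col      = { mat_weights_rows, conv_w * conv_h }  = { 148, 12544 }
// shape_gemm        = { mat_weights_cols, conv_w * conv_h }  = { 64, 12544 }
// i.e. the layer is computed as a single (12544 x 148) * (148 x 64) matrix product.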
diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
index 7d2c549..ddf7e90 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenate.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
@@ -24,28 +24,29 @@
#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-NEDepthConcatenate::NEDepthConcatenate()
- : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+NEDepthConcatenate::NEDepthConcatenate() // NOLINT
+ : _inputs_vector(),
+ _concat_kernels_vector(),
+ _border_handlers_vector(),
+ _num_inputs(0)
{
}
-void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output) // NOLINT
{
ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
_num_inputs = inputs_vector.size();
- _concat_kernels_vector = arm_compute::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
- _border_handlers_vector = arm_compute::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
+ _concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
+ _border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
unsigned int depth_offset = 0;
for(unsigned int i = 0; i < _num_inputs; ++i)
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvert.cpp
index a339cae..37857b6 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvert.cpp
@@ -23,22 +23,16 @@
*/
#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
using namespace arm_compute;
-void NEDepthConvert::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
- ARM_COMPUTE_ERROR_ON(input == output);
- ARM_COMPUTE_ERROR_ON(input->info()->data_type() == output->info()->data_type());
-
- auto k = arm_compute::cpp14::make_unique<NEDepthConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
k->configure(input, output, policy, shift);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
new file mode 100644
index 0000000..a58b6e4
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEDequantizationLayer::NEDequantizationLayer()
+ : _dequantize_kernel()
+{
+}
+
+void NEDequantizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
+{
+ // Configure kernel
+ _dequantize_kernel.configure(input, output, min_max);
+}
+
+void NEDequantizationLayer::run()
+{
+ NEScheduler::get().schedule(&_dequantize_kernel, Window::DimY);
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp
index 2887c13..8118030 100644
--- a/src/runtime/NEON/functions/NEDerivative.cpp
+++ b/src/runtime/NEON/functions/NEDerivative.cpp
@@ -42,11 +42,11 @@
ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
_kernel.configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
- _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
void NEDerivative::run()
{
- _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
NEScheduler::get().schedule(&_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEDilate.cpp b/src/runtime/NEON/functions/NEDilate.cpp
index 0c016f1..5c733a8 100644
--- a/src/runtime/NEON/functions/NEDilate.cpp
+++ b/src/runtime/NEON/functions/NEDilate.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEDilate.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void NEDilate::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NEDilateKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEDilateKernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 3f3e771..b831a6a 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -33,15 +33,13 @@
using namespace arm_compute;
-NEDirectConvolutionLayer::NEDirectConvolutionLayer()
- : _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
+NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
{
}
void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
-
// Free accumulator
if(_accumulator.buffer() != nullptr)
{
@@ -49,17 +47,38 @@
}
// Allocate the intermediate accumulator tensor in case of fixed point input
- if(output->info()->data_type() == DataType::QS8)
+ switch(output->info()->data_type())
{
- _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
- _conv_kernel.configure(input, weights, &_accumulator, conv_info);
- _accumulate_bias_kernel.configure(&_accumulator, bias, output);
- _accumulator.allocator()->allocate();
- }
- else
- {
- _conv_kernel.configure(input, weights, output, conv_info);
- _accumulate_bias_kernel.configure(output, bias);
+ case DataType::QS8:
+ {
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
+ _memory_group.manage(&_accumulator);
+ _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+ _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+ _accumulator.allocator()->allocate();
+ break;
+ }
+ case DataType::QS16:
+ {
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS32, output->info()->fixed_point_position()));
+ _memory_group.manage(&_accumulator);
+ _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+ _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+ _accumulator.allocator()->allocate();
+ break;
+ }
+ case DataType::F16:
+ case DataType::F32:
+ {
+ _conv_kernel.configure(input, weights, output, conv_info);
+ _accumulate_bias_kernel.configure(output, bias);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
}
// Add zero padding XY
@@ -68,8 +87,12 @@
void NEDirectConvolutionLayer::run()
{
- _input_border_handler.run(_input_border_handler.window());
+ NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
+
+ _memory_group.acquire();
NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+
+ _memory_group.release();
}
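The direct convolution's intermediate accumulator is now registered with a MemoryGroup so that, when several functions share one IMemoryManager, their temporaries can reuse the same backing memory. The lifecycle is: manage() the tensor before configuring the kernels that touch it, allocate() it once its last consumer is configured, and bracket run() with acquire()/release(). A minimal sketch of that pattern, assuming the runtime MemoryGroup header at arm_compute/runtime/MemoryGroup.h; the class and member names below are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"

#include <memory>
#include <utility>

// Illustrative function skeleton showing the manage/allocate/acquire/release
// lifecycle used throughout this patch. Kernel configuration is elided.
class ExampleFunction
{
public:
    ExampleFunction(std::shared_ptr<arm_compute::IMemoryManager> memory_manager = nullptr)
        : _memory_group(std::move(memory_manager)), _tmp()
    {
    }

    void configure(const arm_compute::TensorShape &shape)
    {
        _tmp.allocator()->init(arm_compute::TensorInfo(shape, 1, arm_compute::DataType::F32));
        _memory_group.manage(&_tmp);  // register before configuring the consumers
        // ... configure the kernels that read/write _tmp ...
        _tmp.allocator()->allocate(); // allocate once the last consumer is configured
    }

    void run()
    {
        _memory_group.acquire();      // obtain backing memory for managed tensors
        // ... schedule the kernels ...
        _memory_group.release();      // hand the memory back for reuse
    }

private:
    arm_compute::MemoryGroup _memory_group;
    arm_compute::Tensor      _tmp;
};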
diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
index f6ec677..70b93ca 100644
--- a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
+++ b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
@@ -55,7 +55,7 @@
NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
// Calculate cumulative distribution of histogram and create LUT.
- _cd_histogram_kernel.run(_cd_histogram_kernel.window());
+ NEScheduler::get().schedule(&_cd_histogram_kernel, Window::DimY);
// Map input to output using created LUT.
NEScheduler::get().schedule(&_map_histogram_kernel, Window::DimY);
diff --git a/src/runtime/NEON/functions/NEErode.cpp b/src/runtime/NEON/functions/NEErode.cpp
index 9b011db..3609572 100644
--- a/src/runtime/NEON/functions/NEErode.cpp
+++ b/src/runtime/NEON/functions/NEErode.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEErode.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void NEErode::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NEErodeKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEErodeKernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index 33a58f1..4137b1d 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -35,8 +35,9 @@
using namespace arm_compute;
-NEFastCorners::NEFastCorners()
- : _fast_corners_kernel(),
+NEFastCorners::NEFastCorners(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)),
+ _fast_corners_kernel(),
_border_handler(),
_nonmax_kernel(),
_fill_kernel(),
@@ -59,6 +60,7 @@
TensorInfo tensor_info(input->info()->tensor_shape(), Format::U8);
_output.allocator()->init(tensor_info);
+ _memory_group.manage(&_output);
// If border is UNDEFINED _fast_corners_kernel will operate in xwindow (3,
// width - 3) and ywindow (3, height -3) so the output image will leave the
@@ -75,6 +77,7 @@
else
{
_suppressed.allocator()->init(tensor_info);
+ _memory_group.manage(&_suppressed);
_nonmax_kernel.configure(&_output, &_suppressed, BorderMode::UNDEFINED == border_mode);
_fill_kernel.configure(&_suppressed, 1 /* we keep all texels >0 */, corners);
@@ -88,7 +91,9 @@
void NEFastCorners::run()
{
- _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+
+ _memory_group.acquire();
NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
@@ -98,4 +103,6 @@
}
NEScheduler::get().schedule(&_fill_kernel, Window::DimY);
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
index e884f4a..44e4952 100644
--- a/src/runtime/NEON/functions/NEFillBorder.cpp
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -30,7 +30,7 @@
void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
{
- _border_handler.configure(input, border_width, border_mode, constant_border_value);
+ _border_handler.configure(input, BorderSize(border_width), border_mode, constant_border_value);
}
void NEFillBorder::run()
diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp
new file mode 100644
index 0000000..0000cdd
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFloor.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFloor.h"
+
+#include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void NEFloor::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEFloorKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
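NEFloor is a new simple function wrapping NEFloorKernel, so run() comes from the common simple-function base class. A minimal usage sketch; the shapes and the F32 data type are illustrative and should be checked against the kernel's documented support:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEFloor.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src;
    Tensor dst;

    // Illustrative 2D F32 tensors; the function computes dst(x) = floor(src(x)).
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

    NEFloor floor_fn;
    floor_fn.configure(&src, &dst);

    // Allocate after configuration, fill src, then run.
    src.allocator()->allocate();
    dst.allocator()->allocate();

    floor_fn.run();
    return 0;
}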
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index abb41e9..2e8d105 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -23,27 +23,28 @@
*/
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include <algorithm>
#include <cmath>
-using namespace arm_compute;
-
-NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights()
- : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+namespace arm_compute
+{
+NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
{
}
void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 2);
ARM_COMPUTE_ERROR_ON(output == nullptr);
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
- ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
+ ARM_COMPUTE_ERROR_ON(!transpose_weights && !is_batched_fc_layer);
- const DataType dt = input->info()->data_type();
+ const DataType data_type = input->info()->data_type();
const int fixed_point_position = input->info()->fixed_point_position();
_transpose_weights = transpose_weights;
@@ -56,7 +57,8 @@
{
// Initialize the output tensor for transpose
TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
- _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
+ _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, data_type, fixed_point_position));
+ _memory_group.manage(&_transpose_output);
_transpose_kernel.configure(input, &_transpose_output);
// Configure transpose 1xW kernel
@@ -86,229 +88,161 @@
void NEFullyConnectedLayerReshapeWeights::run()
{
+ _memory_group.acquire();
+
if(_transpose_weights)
{
NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
}
+
if(_is_batched_fc_layer)
{
NEScheduler::get().schedule(&_transpose1xW_kernel, Window::DimY);
}
+
+ _memory_group.release();
}
-NEFullyConnectedLayer::NEFullyConnectedLayer()
- : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
- _are_weights_reshaped(false), _is_fc_after_conv(false), _is_batched_fc_layer(false), _accumulate_biases(false)
+NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(),
+ _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false)
{
}
-void NEFullyConnectedLayer::configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
-
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
- // Initialize output tensor for im2col
- TensorShape shape_im2col;
- shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
- shape_im2col.set(1, input->info()->dimension(3));
- shape_im2col.set(2, input->info()->dimension(4));
- shape_im2col.set(3, input->info()->dimension(5));
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
-
- // Initialize output tensor for interleave 4x4
- TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
-
- // Configure im2col kernel
- _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
- // Configure interleave4x4 kernel
- _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
-
- // Allocate the tensors once all the configure methods have been called
- _im2col_output.allocator()->allocate();
- _interleave4x4_output.allocator()->allocate();
-}
-
-void NEFullyConnectedLayer::configure_fc_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
-{
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- // Initialize output tensor for interleave 4x4
- TensorShape shape_interleaved = input->info()->tensor_shape();
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
-
- // Configure interleave4x4 kernel
- _interleave4x4_kernel.configure(input, &_interleave4x4_output);
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
-
- // Allocate the tensors once all the configure methods have been called
- _interleave4x4_output.allocator()->allocate();
-}
-
-void NEFullyConnectedLayer::configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
- // Initialize output tensor for im2col
- TensorShape shape_im2col;
- shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
- shape_im2col.set(1, 1);
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
-
- // Configure im2col kernel
- _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(&_im2col_output, weights, output, 1.0f);
-
- // Allocate the output tensor for im2col once all the configure methods have been called
- _im2col_output.allocator()->allocate();
-}
-
-void NEFullyConnectedLayer::configure_fc_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(input, weights, output, 1.0f);
-}
-
void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights, bool are_weights_reshaped)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
-
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- _are_weights_reshaped = are_weights_reshaped;
- _is_fc_after_conv = true;
- _is_batched_fc_layer = false;
- _accumulate_biases = false;
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-
- _accumulate_biases = true;
-
- // Configure accumulate biases kernel
- _accumulate_biases_kernel.configure(output, biases);
- }
-
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
// 2) Fully Connected layer -> Fully Connected layer without batches
// 3) Convolution layer -> Fully Connected layer with batches
// 4) Fully Connected layer -> Fully Connected layer with batches
- // Check if we have a fully connected layer with batches
- _is_batched_fc_layer = (output->info()->dimension(1) > 1);
+ // Expected shape before transpose and reshaping
+ // Input: In x B (In and B can be multi-dimensional)
+ // Weights: flat(In) x Out
+ // Biases: Out
+ // Output: Out x B (B can be multi-dimensional)
- const ITensor *weights_to_use = weights;
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, weights, output);
- if(!are_weights_reshaped)
+ const DataType data_type = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+ const int num_batch_dimensions = std::max(0, static_cast<int>(output->info()->tensor_shape().num_dimensions()) - 1);
+ const int num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
+ const size_t linear_input_size = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
+
+ _linearize_input = input->info()->tensor_shape().x() != linear_input_size;
+ _are_weights_reshaped = are_weights_reshaped;
+ _accumulate_biases = biases != nullptr;
+ _is_batched_fc_layer = num_batch_dimensions > 0;
+
+ // Check if number of batches match
+ ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size_upper(num_input_dimensions) != output->info()->tensor_shape().total_size_upper(1));
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
+
+ const size_t interleave_width = 16 / input->info()->element_size();
+ const ITensor *weights_to_use = weights;
+
+ if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
{
- if((transpose_weights || _is_batched_fc_layer))
+ weights_to_use = &_reshape_weights_output;
+
+ TensorShape reshaped_weights_shape(weights->info()->tensor_shape());
+
+ // Transpose weights if the user hasn't done it
+ if(transpose_weights)
{
- weights_to_use = &_reshape_weights_output;
-
- if(transpose_weights)
- {
- if(_is_batched_fc_layer)
- {
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
- else
- {
- TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
- }
- else
- {
- ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
-
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
-
- // Reshape the weights
- _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+ const size_t shape_x = reshaped_weights_shape.x();
+ reshaped_weights_shape.set(0, reshaped_weights_shape.y());
+ reshaped_weights_shape.set(1, shape_x);
}
+
+ // If we run multiple batches we need the 1xW transpose, too.
+ if(_is_batched_fc_layer)
+ {
+ const float shape_x = reshaped_weights_shape.x();
+ reshaped_weights_shape.set(0, reshaped_weights_shape.y() * interleave_width);
+ reshaped_weights_shape.set(1, static_cast<unsigned int>(std::ceil(shape_x / interleave_width)));
+ }
+
+ _reshape_weights_output.allocator()->init(TensorInfo(reshaped_weights_shape, 1, data_type, fixed_point_position));
+
+ // Reshape the weights
+ _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+ }
+
+ // Check correct shape of weights
+ if(_is_batched_fc_layer)
+ {
+ // Transpose + Transpose1xW
+ ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().x() != linear_input_size * interleave_width);
+ ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().y() != static_cast<unsigned int>(std::ceil(static_cast<float>(output->info()->tensor_shape().x()) / interleave_width)));
+ }
+ else
+ {
+ // Transpose
+ ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().x() != output->info()->tensor_shape().x());
+ ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().y() != linear_input_size);
+ }
+
+ const ITensor *multiply_input = input;
+
+ if(_linearize_input)
+ {
+ TensorShape shape_im2col(input->info()->tensor_shape());
+ shape_im2col.collapse(num_input_dimensions);
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, data_type, fixed_point_position));
+
+ // Configure im2col kernel
+ _memory_group.manage(&_im2col_output);
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+ multiply_input = &_im2col_output;
}
if(_is_batched_fc_layer)
{
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1));
+ TensorShape shape_interleaved(multiply_input->info()->tensor_shape());
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, data_type, fixed_point_position));
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer with batches
- configure_conv_fc_wb(input, weights_to_use, output);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer with batches
- configure_fc_fc_wb(input, weights_to_use, output);
- }
+ // Configure interleave4x4 kernel
+ _memory_group.manage(&_interleave4x4_output);
+ _interleave4x4_kernel.configure(multiply_input, &_interleave4x4_output);
+
+ multiply_input = &_interleave4x4_output;
}
- else
- {
- // In case of not batched fully connected layer, the weights will not be reshaped using transposed1xW
- _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- configure_conv_fc_nb(input, weights_to_use, output);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- configure_fc_fc_nb(input, weights_to_use, output);
- }
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(multiply_input, weights_to_use, output, 1.0f);
+
+ if(_accumulate_biases)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->tensor_shape().x() != output->info()->tensor_shape().x());
+
+ // Configure accumulate biases kernel
+ _accumulate_biases_kernel.configure(output, biases);
}
// Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
- if(!are_weights_reshaped)
+ if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
{
- if(transpose_weights || _is_batched_fc_layer)
- {
- // Allocate the tensor for the weights reshaped
- _reshape_weights_output.allocator()->allocate();
- }
+ // Allocate the tensor for the weights reshaped
+ _reshape_weights_output.allocator()->allocate();
+ }
+
+ if(_linearize_input)
+ {
+ _im2col_output.allocator()->allocate();
+ }
+
+ if(_is_batched_fc_layer)
+ {
+ _interleave4x4_output.allocator()->allocate();
}
}
@@ -321,8 +255,10 @@
_reshape_weights_kernel.run();
}
- // Linearize input if comes from a convolutional layer
- if(_is_fc_after_conv)
+ _memory_group.acquire();
+
+ // Linearize input if it comes from a convolutional layer
+ if(_linearize_input)
{
NEScheduler::get().schedule(&_im2col_kernel, Window::DimY);
}
@@ -341,4 +277,7 @@
{
NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
}
+
+ _memory_group.release();
}
+} // namespace arm_compute
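The rewritten configure() derives its behaviour from the shape contract in the comment above (Input: In x B, Weights: flat(In) x Out, Output: Out x B): the batch dimensions are read off the output, the leading input dimensions are flattened, im2col is only inserted when the input is not already linear, and the 1xW transpose is only needed in the batched case. A small sketch of that bookkeeping on concrete, illustrative shapes:

#include "arm_compute/core/TensorShape.h"

#include <algorithm>
#include <cstddef>
#include <iostream>

using namespace arm_compute;

int main()
{
    // Illustrative case 3): a 7x7x64 convolution output with a batch of 8,
    // fully connected to 4096 outputs per batch element.
    const TensorShape input_shape(7U, 7U, 64U, 8U); // In x B
    const TensorShape output_shape(4096U, 8U);      // Out x B

    const int    num_batch_dimensions = std::max(0, static_cast<int>(output_shape.num_dimensions()) - 1);
    const int    num_input_dimensions = static_cast<int>(input_shape.num_dimensions()) - num_batch_dimensions;
    const size_t linear_input_size    = input_shape.total_size_lower(num_input_dimensions);

    const bool linearize_input = input_shape.x() != linear_input_size; // true: 7 != 7 * 7 * 64
    const bool is_batched      = num_batch_dimensions > 0;             // true: batch of 8

    std::cout << "flattened input size: " << linear_input_size // 3136
              << ", linearize: " << linearize_input
              << ", batched: " << is_batched << std::endl;
    return 0;
}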
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 15d5f4e..ff92ef8 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -26,30 +26,41 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
#include <cmath>
-using namespace arm_compute;
-
-NEGEMM::NEGEMM()
- : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
+namespace arm_compute
+{
+NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
+ _run_vector_matrix_multiplication(false), _run_addition(false)
{
}
void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(d, 1, DataType::F32, DataType::F16, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
if(c != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
@@ -57,100 +68,135 @@
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
}
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
- ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
- // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
- if((a->info()->dimension(1) == 1))
+ // Check if the first input tensor is a vector.
+ // If so, all the kernels for reshaping the tensors can be skipped
+ if(_run_vector_matrix_multiplication)
{
- _run_vector_matrix_multiplication = true;
-
// Configure the matrix multiply kernel
_mm_kernel.configure(a, b, d, alpha);
+
+ // Configure matrix addition kernel
+ if(beta != 0 && c != nullptr)
+ {
+ _ma_kernel.configure(c, d, beta);
+ _run_addition = true;
+ }
}
else
{
- _run_vector_matrix_multiplication = false;
-
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- TensorShape shape_tmp_b = b->info()->tensor_shape();
-
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
-
- switch(a->info()->data_type())
+#if defined(__arm__)
+ if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
{
- case DataType::F32:
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
+ }
+#elif defined(__aarch64__)
+ if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
+ }
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+#if defined(__arm__) || defined(__aarch64__)
+ if(_mm_optimised_kernel != nullptr)
+ {
+ struct CPUInfo ci = NEScheduler::get().cpu_info();
+
+ const int M = d->info()->tensor_shape().y();
+ const int N = d->info()->tensor_shape().x();
+ const int K = a->info()->tensor_shape().x();
+
+#if defined(__arm__)
+ GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+#elif defined(__aarch64__)
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+ constexpr size_t alignment = 4096;
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ _memory_group.manage(&_workspace);
+
+ // Configure matrix multiplication kernel
+ _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f);
+
+ _workspace.allocator()->allocate();
+ }
+ else
+#endif /* defined(__arm__) || defined(__aarch64__) */
+ {
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
+
+ const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type());
+ shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
+
+ _tmp_a.allocator()->init(info_a);
+ _tmp_b.allocator()->init(info_b);
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
+ // Configure interleave kernel
+ _interleave_kernel.configure(a, &_tmp_a);
+
+ // Configure transpose kernel
+ _transpose_kernel.configure(b, &_tmp_b);
+
+ // Configure matrix multiplication kernel
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);
+
+ // Allocate once all the configure methods have been called
+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+
+ // Configure matrix addition kernel
+ if(beta != 0 && c != nullptr)
{
- shape_tmp_b.set(0, b->info()->dimension(1) * 4);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.0f));
- break;
- }
- case DataType::F16:
-#ifdef ARM_COMPUTE_ENABLE_FP16
- {
- shape_tmp_b.set(0, b->info()->dimension(1) * 8);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 8.0f));
- break;
- }
-#endif
- case DataType::QS8:
- {
- shape_tmp_b.set(0, b->info()->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.0f));
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR_ON("Data type not supported");
+ _ma_kernel.configure(c, d, beta);
+ _run_addition = true;
}
}
-
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
-
- _tmp_a.allocator()->init(info_a);
- _tmp_b.allocator()->init(info_b);
-
- // Configure interleave kernel
- _interleave_kernel.configure(a, &_tmp_a);
-
- // Configure transpose kernel
- _transpose_kernel.configure(b, &_tmp_b);
-
- // Configure matrix multiplication kernel
- _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);
-
- // Allocate once the all configure methods have been called
- _tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
- }
-
- // Configure matrix addition kernel
- if(beta != 0 && c != nullptr)
- {
- _ma_kernel.configure(c, d, beta);
- _run_addition = true;
}
}
void NEGEMM::run()
{
- if(!_run_vector_matrix_multiplication)
- {
- // Run interleave kernel
- NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
+ _memory_group.acquire();
- // Run transpose kernel
- NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ if(_mm_optimised_kernel != nullptr)
+ {
+ NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+ _memory_group.release();
}
-
- // Run matrix multiply kernel
- NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
-
- // Run matrix addition kernel
- if(_run_addition)
+ else
{
- NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+ if(!_run_vector_matrix_multiplication)
+ {
+ // Run interleave kernel
+ NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
+
+ // Run transpose kernel
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ }
+
+ NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
+
+ _memory_group.release();
+
+ // Run matrix addition kernel
+ if(_run_addition)
+ {
+ NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+ }
}
}
+} // namespace arm_compute
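NEGEMM now dispatches to a hand-written assembly kernel (NEGEMMAArch32Kernel or NEGEMMAArch64Kernel) when the CPU target, the F32 data type and a zero beta or absent C matrix allow it, backing it with a per-thread workspace tensor, and otherwise falls back to the interleave, transpose and multiply path. The public interface is unchanged apart from the optional memory manager; a usage sketch with illustrative matrix sizes:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor a, b, d;

    // D = alpha * A * B with A: K x M, B: N x K, D: N x M (shapes are x-major,
    // i.e. TensorShape(width, height)); the sizes are illustrative.
    constexpr unsigned int M = 64, N = 32, K = 128;
    a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32));
    d.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32));

    NEGEMM gemm; // optionally pass a shared memory manager to the constructor
    gemm.configure(&a, &b, nullptr /* no C matrix */, &d, 1.0f /* alpha */, 0.0f /* beta */);

    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();

    // Fill a and b, then:
    gemm.run();
    return 0;
}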
diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
index 4c77c88..63f330b 100644
--- a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
+++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
@@ -23,14 +23,14 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp
index b64f769..7413b28 100644
--- a/src/runtime/NEON/functions/NEGEMMLowp.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowp.cpp
@@ -34,8 +34,8 @@
using namespace arm_compute;
-NEGEMMLowp::NEGEMMLowp()
- : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+NEGEMMLowp::NEGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
{
}
@@ -63,6 +63,10 @@
_tmp_a.allocator()->init(info_a);
_tmp_b.allocator()->init(info_b);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
_interleave_kernel.configure(a, &_tmp_a);
_transpose_kernel.configure(b, &_tmp_b);
_mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift);
@@ -73,6 +77,8 @@
void NEGEMMLowp::run()
{
+ _memory_group.acquire();
+
/* Run interleave kernel */
NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
@@ -81,4 +87,6 @@
/* Run matrix multiply kernel */
NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
index dc40ece..571bf2b 100644
--- a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
+++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
@@ -24,17 +24,17 @@
#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
void NEGEMMTranspose1xW::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEGaussian3x3.cpp b/src/runtime/NEON/functions/NEGaussian3x3.cpp
index 95ba5cb..db8eb63 100644
--- a/src/runtime/NEON/functions/NEGaussian3x3.cpp
+++ b/src/runtime/NEON/functions/NEGaussian3x3.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void NEGaussian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NEGaussian3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEGaussian3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index 5ccc765..b010ca0 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -32,17 +32,20 @@
using namespace arm_compute;
-NEGaussian5x5::NEGaussian5x5()
- : _kernel_hor(), _kernel_vert(), _tmp(), _border_handler()
+NEGaussian5x5::NEGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _tmp(), _border_handler()
{
}
void NEGaussian5x5::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
// Init temporary buffer
- TensorInfo tensor_info(input->info()->tensor_shape(), Format::S16);
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16);
_tmp.allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+
// Create and configure kernels for the two passes
_kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
_kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
@@ -54,7 +57,12 @@
void NEGaussian5x5::run()
{
- _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+
+ _memory_group.acquire();
+
NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+
+ _memory_group.release();
}
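Functions that gained a std::shared_ptr<IMemoryManager> constructor argument (NEGaussian5x5 here, and NEGEMM, NEFullyConnectedLayer and others above) default it to nullptr, so existing call sites keep compiling, while passing one shared manager to several functions lets their intermediates share memory pools. A hedged sketch of sharing a manager, assuming the memory-manager classes introduced alongside this change (BlobLifetimeManager, PoolManager, MemoryManagerOnDemand); the exact set-up and finalisation steps should be taken from the library's own memory-manager examples:

#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"

#include <memory>

using namespace arm_compute;

int main()
{
    // Assumed construction pattern: a lifetime manager and a pool manager
    // drive an on-demand memory manager that several functions share.
    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
    auto pool_mgr     = std::make_shared<PoolManager>();
    auto memory_mgr   = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    NEGaussian5x5 gauss(memory_mgr);
    NEGEMM        gemm(memory_mgr);

    // ... configure both functions and allocate their inputs/outputs, then
    // finalise the memory manager (see the library's memory-manager examples)
    // before calling gauss.run() and gemm.run() ...
    return 0;
}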
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index e1d64f1..84ea0ca 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
@@ -36,6 +35,7 @@
#include "arm_compute/runtime/Pyramid.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <cstddef>
@@ -46,8 +46,10 @@
{
}
-NEGaussianPyramidHalf::NEGaussianPyramidHalf()
- : _border_handler(), _horizontal_reduction(), _vertical_reduction()
+NEGaussianPyramidHalf::NEGaussianPyramidHalf() // NOLINT
+ : _border_handler(),
+ _horizontal_reduction(),
+ _vertical_reduction()
{
}
@@ -68,9 +70,9 @@
if(num_levels > 1)
{
- _border_handler = arm_compute::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
- _horizontal_reduction = arm_compute::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
- _vertical_reduction = arm_compute::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
+ _border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
+ _horizontal_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
+ _vertical_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
// Apply half scale to the X dimension of the tensor shape
TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -107,14 +109,15 @@
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
- _border_handler[i].run(_border_handler[i].window());
+ NEScheduler::get().schedule(_border_handler.get() + i, Window::DimZ);
NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
}
}
-NEGaussianPyramidOrb::NEGaussianPyramidOrb()
- : _offsets(), _gaus5x5(), _scale_nearest()
+NEGaussianPyramidOrb::NEGaussianPyramidOrb() // NOLINT
+ : _gaus5x5(),
+ _scale_nearest()
{
}
@@ -135,30 +138,19 @@
if(num_levels > 1)
{
- _gaus5x5 = arm_compute::cpp14::make_unique<NEGaussian5x5[]>(num_levels - 1);
- _scale_nearest = arm_compute::cpp14::make_unique<NEScaleKernel[]>(num_levels - 1);
- _offsets = arm_compute::cpp14::make_unique<Image[]>(num_levels - 1);
+ _gaus5x5 = arm_compute::support::cpp14::make_unique<NEGaussian5x5[]>(num_levels - 1);
+ _scale_nearest = arm_compute::support::cpp14::make_unique<NEScale[]>(num_levels - 1);
PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
_tmp.init(pyramid_info);
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
- const size_t width = _pyramid->get_pyramid_level(i + 1)->info()->dimension(0);
- const size_t height = _pyramid->get_pyramid_level(i + 1)->info()->dimension(1);
-
- /* Allocate Image for the offsets used by NEAREST interpolation */
- TensorInfo tensor_info(TensorShape(width, height), Format::S32);
- _offsets[i].allocator()->init(tensor_info);
-
/* Configure gaussian 5x5 */
_gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
- /* Configure scale image kernel */
- _scale_nearest[i].configure(_tmp.get_pyramid_level(i), nullptr, nullptr, _offsets.get() + i, _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR,
- border_mode == BorderMode::UNDEFINED);
-
- _offsets[i].allocator()->allocate();
+ /* Configure scale */
+ _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED);
}
_tmp.allocate();
@@ -178,6 +170,6 @@
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
_gaus5x5[i].run();
- NEScheduler::get().schedule(_scale_nearest.get() + i, Window::DimY);
+ _scale_nearest[i].run();
}
}
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index a592f53..5e98269 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -31,8 +31,8 @@
using namespace arm_compute;
-NEHOGDescriptor::NEHOGDescriptor()
- : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+NEHOGDescriptor::NEHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
{
}
@@ -71,9 +71,16 @@
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
_hog_space.allocator()->init(info_space);
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Initialise gradient kernel
_gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+ // Manage intermediate buffers
+ _memory_group.manage(&_hog_space);
+
// Initialise orientation binning kernel
_orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
@@ -88,6 +95,8 @@
void NEHOGDescriptor::run()
{
+ _memory_group.acquire();
+
// Run gradient
_gradient.run();
@@ -96,4 +105,6 @@
// Run block normalization kernel
NEScheduler::get().schedule(&_block_norm, Window::DimY);
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp
index e8ed29d..49d0778 100644
--- a/src/runtime/NEON/functions/NEHOGDetector.cpp
+++ b/src/runtime/NEON/functions/NEHOGDetector.cpp
@@ -23,14 +23,14 @@
*/
#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
void NEHOGDetector::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
{
- auto k = arm_compute::cpp14::make_unique<NEHOGDetectorKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEHOGDetectorKernel>();
k->configure(input, hog, detection_windows, detection_window_stride, threshold, idx_class);
_kernel = std::move(k);
}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index 2f4b880..efc8690 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -23,15 +23,19 @@
*/
#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-NEHOGGradient::NEHOGGradient()
- : _derivative(), _mag_phase(nullptr), _gx(), _gy()
+NEHOGGradient::NEHOGGradient(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _derivative(),
+ _mag_phase(nullptr),
+ _gx(),
+ _gy()
{
}
@@ -48,19 +52,23 @@
_gx.allocator()->init(info);
_gy.allocator()->init(info);
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
+
// Initialise derivative kernel
_derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
// Initialise magnitude/phase kernel
if(PhaseType::UNSIGNED == phase_type)
{
- auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>();
k->configure(&_gx, &_gy, output_magnitude, output_phase);
_mag_phase = std::move(k);
}
else
{
- auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
k->configure(&_gx, &_gy, output_magnitude, output_phase);
_mag_phase = std::move(k);
}
@@ -72,9 +80,13 @@
void NEHOGGradient::run()
{
+ _memory_group.acquire();
+
// Run derivative
_derivative.run();
// Run magnitude/phase kernel
NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index 173b8f4..8c834e2 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -24,16 +24,30 @@
#include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-NEHOGMultiDetection::NEHOGMultiDetection()
- : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(),
- _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0)
+NEHOGMultiDetection::NEHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _gradient_kernel(),
+ _orient_bin_kernel(),
+ _block_norm_kernel(),
+ _hog_detect_kernel(),
+ _non_maxima_kernel(),
+ _hog_space(),
+ _hog_norm_space(),
+ _detection_windows(),
+ _mag(),
+ _phase(),
+ _non_maxima_suppression(false),
+ _num_orient_bin_kernel(0),
+ _num_block_norm_kernel(0),
+ _num_hog_detect_kernel(0)
{
}
@@ -112,12 +126,12 @@
_num_block_norm_kernel = input_block_norm.size(); // Number of NEHOGBlockNormalizationKernel kernels to compute
_num_hog_detect_kernel = input_hog_detect.size(); // Number of NEHOGDetector functions to compute
- _orient_bin_kernel = arm_compute::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
- _block_norm_kernel = arm_compute::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
- _hog_detect_kernel = arm_compute::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
- _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
- _hog_space = arm_compute::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
- _hog_norm_space = arm_compute::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
+ _orient_bin_kernel = arm_compute::support::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
+ _block_norm_kernel = arm_compute::support::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
+ _hog_detect_kernel = arm_compute::support::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
+ _non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
+ _hog_space = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
+ _hog_norm_space = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
// Allocate tensors for magnitude and phase
TensorInfo info_mag(shape_img, Format::S16);
@@ -126,6 +140,10 @@
TensorInfo info_phase(shape_img, Format::U8);
_phase.allocator()->init(info_phase);
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Initialise gradient kernel
_gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
@@ -151,10 +169,17 @@
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
_hog_space[i].allocator()->init(info_space);
+ // Manage intermediate buffers
+ _memory_group.manage(_hog_space.get() + i);
+
// Initialise orientation binning kernel
_orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
}
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+
// Configure NETensor for the normalized HOG space and block normalization kernel
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
@@ -165,10 +190,19 @@
TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
_hog_norm_space[i].allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(_hog_norm_space.get() + i);
+
// Initialize block normalization kernel
_block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
}
+ // Allocate intermediate tensors
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ _hog_space[i].allocator()->allocate();
+ }
+
// Configure HOG detector kernel
for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
{
@@ -181,14 +215,6 @@
_non_maxima_kernel->configure(_detection_windows, min_distance);
// Allocate intermediate tensors
- _mag.allocator()->allocate();
- _phase.allocator()->allocate();
-
- for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
- {
- _hog_space[i].allocator()->allocate();
- }
-
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
_hog_norm_space[i].allocator()->allocate();
@@ -199,6 +225,8 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
+ _memory_group.acquire();
+
// Reset detection window
_detection_windows->clear();
@@ -226,6 +254,8 @@
// Run non-maxima suppression kernel if enabled
if(_non_maxima_suppression)
{
- _non_maxima_kernel->run(_non_maxima_kernel->window());
+ NEScheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
}
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index b54fb67..25e28d2 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
#include "arm_compute/core/TensorInfo.h"
@@ -35,14 +34,28 @@
#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <cmath>
#include <utility>
using namespace arm_compute;
-NEHarrisCorners::NEHarrisCorners()
- : _sobel(), _harris_score(), _non_max_suppr(), _candidates(), _sort_euclidean(), _border_gx(), _border_gy(), _gx(), _gy(), _score(), _nonmax(), _corners_list(), _num_corner_candidates(0)
+NEHarrisCorners::NEHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _sobel(),
+ _harris_score(),
+ _non_max_suppr(),
+ _candidates(),
+ _sort_euclidean(),
+ _border_gx(),
+ _border_gy(),
+ _gx(),
+ _gy(),
+ _score(),
+ _nonmax(),
+ _corners_list(),
+ _num_corner_candidates(0)
{
}
@@ -69,32 +82,36 @@
_gx.allocator()->init(tensor_info_gxgy);
_gy.allocator()->init(tensor_info_gxgy);
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
+
TensorInfo tensor_info_score(shape, Format::F32);
_score.allocator()->init(tensor_info_score);
_nonmax.allocator()->init(tensor_info_score);
- _corners_list = arm_compute::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+ _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
// Set/init Sobel kernel accordingly with gradient_size
switch(gradient_size)
{
case 3:
{
- auto k = arm_compute::cpp14::make_unique<NESobel3x3>();
+ auto k = arm_compute::support::cpp14::make_unique<NESobel3x3>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
}
case 5:
{
- auto k = arm_compute::cpp14::make_unique<NESobel5x5>();
+ auto k = arm_compute::support::cpp14::make_unique<NESobel5x5>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
}
case 7:
{
- auto k = arm_compute::cpp14::make_unique<NESobel7x7>();
+ auto k = arm_compute::support::cpp14::make_unique<NESobel7x7>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
@@ -106,27 +123,30 @@
// Normalization factor
const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
+ // Manage intermediate buffers
+ _memory_group.manage(&_score);
+
if(use_fp16)
{
switch(block_size)
{
case 3:
{
- auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<3>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<3>>();
k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
_harris_score = std::move(k);
}
break;
case 5:
{
- auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<5>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<5>>();
k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
_harris_score = std::move(k);
}
break;
case 7:
{
- auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<7>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<7>>();
k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
_harris_score = std::move(k);
}
@@ -141,21 +161,21 @@
{
case 3:
{
- auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<3>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<3>>();
k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
_harris_score = std::move(k);
}
break;
case 5:
{
- auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<5>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<5>>();
k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
_harris_score = std::move(k);
}
break;
case 7:
{
- auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<7>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<7>>();
k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
_harris_score = std::move(k);
}
@@ -168,26 +188,35 @@
_border_gx.configure(&_gx, _harris_score->border_size(), border_mode, constant_border_value);
_border_gy.configure(&_gy, _harris_score->border_size(), border_mode, constant_border_value);
+ // Allocate once all the configure methods have been called
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_nonmax);
+
// Init non-maxima suppression function
_non_max_suppr.configure(&_score, &_nonmax, border_mode);
+ // Allocate once all the configure methods have been called
+ _score.allocator()->allocate();
+
// Init corner candidates kernel
_candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+ // Allocate once all the configure methods have been called
+ _nonmax.allocator()->allocate();
+
// Init euclidean distance
_sort_euclidean.configure(_corners_list.get(), corners, &_num_corner_candidates, min_dist);
-
- // Allocate once all the configure methods have been called
- _gx.allocator()->allocate();
- _gy.allocator()->allocate();
- _score.allocator()->allocate();
- _nonmax.allocator()->allocate();
}
void NEHarrisCorners::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
+ _memory_group.acquire();
+
// Init to 0 number of corner candidates
_num_corner_candidates = 0;
@@ -195,8 +224,8 @@
_sobel->run();
// Fill border before harris score kernel
- _border_gx.run(_border_gx.window());
- _border_gy.run(_border_gy.window());
+ NEScheduler::get().schedule(&_border_gx, Window::DimZ);
+ NEScheduler::get().schedule(&_border_gy, Window::DimZ);
// Run harris score kernel
NEScheduler::get().schedule(_harris_score.get(), Window::DimY);
@@ -208,5 +237,7 @@
NEScheduler::get().schedule(&_candidates, Window::DimY);
// Run sort & euclidean distance
- _sort_euclidean.run(_sort_euclidean.window());
+ NEScheduler::get().schedule(&_sort_euclidean, Window::DimY);
+
+ _memory_group.release();
}
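
The NEHarrisCorners changes above spell out the ordering contract behind the new MemoryGroup: each intermediate tensor is passed to manage() before the kernel that produces it is configured, its allocate() call is deferred until the last consumer has been configured, and at run time the group is acquired before the first managed kernel and released at the end. A minimal sketch of that pattern with a hypothetical single-intermediate function (MyFunction, _producer and _consumer are illustrative placeholders, not part of this patch):

    // Sketch only: assumes a class with members Tensor _tmp, MemoryGroup _memory_group,
    // and two kernels _producer/_consumer wired as in the functions above.
    void MyFunction::configure(const ITensor *input, ITensor *output)
    {
        _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), Format::F32));

        _memory_group.manage(&_tmp);        // lifetime starts: before configuring the writer
        _producer.configure(input, &_tmp);  // kernel writing the intermediate
        _consumer.configure(&_tmp, output); // kernel reading the intermediate
        _tmp.allocator()->allocate();       // lifetime ends: after configuring the last reader
    }

    void MyFunction::run()
    {
        _memory_group.acquire(); // obtain backing memory for all managed tensors
        NEScheduler::get().schedule(&_producer, Window::DimY);
        NEScheduler::get().schedule(&_consumer, Window::DimY);
        _memory_group.release(); // hand the memory back to the manager
    }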
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
index c42b2a5..f333ecb 100644
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -24,17 +24,17 @@
#include "arm_compute/runtime/NEON/functions/NEHistogram.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IDistribution1D.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
NEHistogram::NEHistogram()
- : _histogram_kernel(), _local_hist(), _window_lut(arm_compute::cpp14::make_unique<uint32_t[]>(window_lut_default_size)), _local_hist_size(0)
+ : _histogram_kernel(), _local_hist(), _window_lut(arm_compute::support::cpp14::make_unique<uint32_t[]>(window_lut_default_size)), _local_hist_size(0)
{
}
@@ -45,7 +45,7 @@
// Allocate space for threads local histograms
_local_hist_size = output->num_bins() * NEScheduler::get().num_threads();
- _local_hist = arm_compute::cpp14::make_unique<uint32_t[]>(_local_hist_size);
+ _local_hist = arm_compute::support::cpp14::make_unique<uint32_t[]>(_local_hist_size);
// Configure kernel
_histogram_kernel.configure(input, output, _local_hist.get(), _window_lut.get());
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
index af604e9..2e94ed5 100644
--- a/src/runtime/NEON/functions/NEIntegralImage.cpp
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void NEIntegralImage::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEIntegralImageKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEIntegralImageKernel>();
k->configure(input, output);
_kernel = std::move(k);
_border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, 0);
diff --git a/src/runtime/NEON/functions/NEL2Normalize.cpp b/src/runtime/NEON/functions/NEL2Normalize.cpp
new file mode 100644
index 0000000..349a781
--- /dev/null
+++ b/src/runtime/NEON/functions/NEL2Normalize.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEL2Normalize.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEL2Normalize::NEL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
+{
+}
+
+void NEL2Normalize::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
+{
+ // Manage intermediate buffers
+ _memory_group.manage(&_sumsq);
+
+ // Configure Kernels
+ _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
+ _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
+
+ // Allocate intermediate tensors
+ _sumsq.allocator()->allocate();
+}
+
+void NEL2Normalize::run()
+{
+ _memory_group.acquire();
+
+ _reduce_func.run();
+ NEScheduler::get().schedule(&_normalize_kernel, Window::DimY);
+
+ _memory_group.release();
+}
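
NEL2Normalize composes the SUM_SQUARE reduction (axis 0 is the only axis that reduction supports, see NEReductionOperation later in this patch) with the normalization kernel, keeping the sum-of-squares tensor inside the memory group. A hedged usage sketch built only from the configure() signature above; the shape and epsilon are arbitrary example values, and passing an explicit nullptr keeps the intermediate self-allocated when no memory manager is wired in:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEL2Normalize.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void l2_normalize_example()
    {
        Tensor src;
        Tensor dst;
        src.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32));

        NEL2Normalize l2_norm(nullptr); // no shared memory manager in this sketch
        l2_norm.configure(&src, &dst, 0 /* axis */, 1e-12f /* epsilon */);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src ...
        l2_norm.run(); // reduction, then row-wise normalization
    }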
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index 8232c79..a680f1f 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IPyramid.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
@@ -33,11 +32,18 @@
#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-NELaplacianPyramid::NELaplacianPyramid()
- : _num_levels(0), _gaussian_pyr_function(), _convf(), _subf(), _gauss_pyr(), _conv_pyr(), _depth_function()
+NELaplacianPyramid::NELaplacianPyramid() // NOLINT
+ : _num_levels(0),
+ _gaussian_pyr_function(),
+ _convf(),
+ _subf(),
+ _gauss_pyr(),
+ _conv_pyr(),
+ _depth_function()
{
}
@@ -86,8 +92,8 @@
// Create Gaussian Pyramid function
_gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
- _convf = arm_compute::cpp14::make_unique<NEGaussian5x5[]>(_num_levels);
- _subf = arm_compute::cpp14::make_unique<NEArithmeticSubtraction[]>(_num_levels);
+ _convf = arm_compute::support::cpp14::make_unique<NEGaussian5x5[]>(_num_levels);
+ _subf = arm_compute::support::cpp14::make_unique<NEArithmeticSubtraction[]>(_num_levels);
for(unsigned int i = 0; i < _num_levels; ++i)
{
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
index 36ac4a7..0893701 100644
--- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -24,18 +24,21 @@
#include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IPyramid.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
#include <cstddef>
using namespace arm_compute;
-NELaplacianReconstruct::NELaplacianReconstruct()
- : _tmp_pyr(), _addf(), _scalef(), _depthf()
+NELaplacianReconstruct::NELaplacianReconstruct() // NOLINT
+ : _tmp_pyr(),
+ _addf(),
+ _scalef(),
+ _depthf()
{
}
@@ -61,8 +64,8 @@
_tmp_pyr.init(pyramid_info);
// Allocate add and scale functions. Level 0 does not need to be scaled.
- _addf = arm_compute::cpp14::make_unique<NEArithmeticAddition[]>(num_levels);
- _scalef = arm_compute::cpp14::make_unique<NEScale[]>(num_levels - 1);
+ _addf = arm_compute::support::cpp14::make_unique<NEArithmeticAddition[]>(num_levels);
+ _scalef = arm_compute::support::cpp14::make_unique<NEScale[]>(num_levels - 1);
const size_t last_level = num_levels - 1;
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 85d7ba3..cb48598 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -33,8 +33,9 @@
using namespace arm_compute;
-NELocallyConnectedLayer::NELocallyConnectedLayer()
- : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+ _is_first_run(false)
{
}
@@ -65,11 +66,14 @@
std::tie(stride_x, stride_y) = conv_info.stride();
std::tie(pad_x, pad_y) = conv_info.pad();
+ const unsigned int kernel_width = weights->info()->dimension(0);
+ const unsigned int kernel_height = weights->info()->dimension(1);
+
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
- stride_x, stride_y, pad_x, pad_y, conv_info.round());
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+ conv_info);
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
@@ -99,8 +103,12 @@
shape_gemm.set(1, mat_input_rows);
_gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+ // Manage intermediate buffers
+ _memory_group.manage(&_input_im2col_reshaped);
+ _memory_group.manage(&_gemm_output);
+
// Configure kernels
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
@@ -120,6 +128,8 @@
NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
}
+ _memory_group.acquire();
+
// Run input reshaping
NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
@@ -128,4 +138,6 @@
// Reshape output matrix
NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+
+ _memory_group.release();
}
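
The output-size computation now takes both kernel dimensions plus the full PadStrideInfo instead of assuming a square kernel and unpacking the strides and pads by hand. With the usual floor rounding each returned dimension is (in + 2 * pad - kernel) / stride + 1; a quick illustrative check of the new call, with example values that are not taken from the patch:

    // Illustrative only: 224x224 input, 3x3 kernel, stride 1, pad 1
    // (PadStrideInfo constructor order assumed: stride_x, stride_y, pad_x, pad_y).
    const unsigned int  in_w = 224, in_h = 224;
    const unsigned int  kernel_w = 3, kernel_h = 3;
    const PadStrideInfo conv_info(1, 1, 1, 1);

    unsigned int conv_w = 0;
    unsigned int conv_h = 0;
    std::tie(conv_w, conv_h) = scaled_dimensions(in_w, in_h, kernel_w, kernel_h, conv_info);
    // conv_w == (224 + 2 * 1 - 3) / 1 + 1 == 224, and conv_h likewise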
diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp
index 9390ca2..7877995 100644
--- a/src/runtime/NEON/functions/NEMagnitude.cpp
+++ b/src/runtime/NEON/functions/NEMagnitude.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEMagnitude.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -35,13 +35,13 @@
{
if(use_fp16)
{
- auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
k->configure(input1, input2, output, nullptr);
_kernel = std::move(k);
}
else
{
- auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
k->configure(input1, input2, output, nullptr);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEMeanStdDev.cpp b/src/runtime/NEON/functions/NEMeanStdDev.cpp
index 47143f5..2304bc8 100644
--- a/src/runtime/NEON/functions/NEMeanStdDev.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDev.cpp
@@ -23,19 +23,19 @@
*/
#include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h"
-#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
using namespace arm_compute;
NEMeanStdDev::NEMeanStdDev()
- : _mean_stddev_kernel(), _global_sum(0), _global_sum_squared(0)
+ : _mean_stddev_kernel(), _fill_border_kernel(), _global_sum(0), _global_sum_squared(0)
{
}
-void NEMeanStdDev::configure(const IImage *input, float *mean, float *stddev)
+void NEMeanStdDev::configure(IImage *input, float *mean, float *stddev)
{
_mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
+ _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
}
void NEMeanStdDev::run()
@@ -43,5 +43,6 @@
_global_sum = 0;
_global_sum_squared = 0;
+ NEScheduler::get().schedule(&_fill_border_kernel, Window::DimZ);
NEScheduler::get().schedule(&_mean_stddev_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEMedian3x3.cpp b/src/runtime/NEON/functions/NEMedian3x3.cpp
index aa7cc97..627e5fb 100644
--- a/src/runtime/NEON/functions/NEMedian3x3.cpp
+++ b/src/runtime/NEON/functions/NEMedian3x3.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEMedian3x3.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void NEMedian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NEMedian3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEMedian3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEMinMaxLocation.cpp b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
index cab9200..54e89ab 100644
--- a/src/runtime/NEON/functions/NEMinMaxLocation.cpp
+++ b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
@@ -32,7 +32,7 @@
{
}
-void NEMinMaxLocation::configure(const IImage *input, int32_t *min, int32_t *max, ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
+void NEMinMaxLocation::configure(const IImage *input, void *min, void *max, ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
{
_min_max.configure(input, min, max);
_min_max_loc.configure(input, min, max, min_loc, max_loc, min_count, max_count);
diff --git a/src/runtime/NEON/functions/NENonLinearFilter.cpp b/src/runtime/NEON/functions/NENonLinearFilter.cpp
index 01aea3b..57bd4e7 100644
--- a/src/runtime/NEON/functions/NENonLinearFilter.cpp
+++ b/src/runtime/NEON/functions/NENonLinearFilter.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -35,7 +35,7 @@
BorderMode border_mode,
uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NENonLinearFilterKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NENonLinearFilterKernel>();
k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
index a7b3759..3b59820 100644
--- a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
+++ b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,16 +32,16 @@
void NENonMaximaSuppression3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode)
{
- auto k = arm_compute::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
if(border_mode != BorderMode::UNDEFINED)
{
- _border_handler.configure(input, 1, BorderMode::CONSTANT, 0);
+ _border_handler.configure(input, BorderSize(1), BorderMode::CONSTANT, 0);
}
else
{
- _border_handler.configure(input, 1, BorderMode::UNDEFINED, 0);
+ _border_handler.configure(input, BorderSize(1), BorderMode::UNDEFINED, 0);
}
}
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index 69ff325..e01ef66 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -32,8 +32,8 @@
using namespace arm_compute;
-NENormalizationLayer::NENormalizationLayer()
- : _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared()
+NENormalizationLayer::NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared()
{
}
@@ -44,6 +44,9 @@
TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
_input_squared.allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(&_input_squared);
+
// Configure kernels
_norm_kernel.configure(input, &_input_squared, output, norm_info);
_multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
@@ -55,7 +58,11 @@
void NENormalizationLayer::run()
{
+ _memory_group.acquire();
+
NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
NEScheduler::get().schedule(&_border_handler, Window::DimY);
NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index 49135e4..e90d8f6 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NEOpticalFlow.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
#include "arm_compute/core/TensorInfo.h"
@@ -34,11 +33,21 @@
#include "arm_compute/runtime/Pyramid.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-NEOpticalFlow::NEOpticalFlow()
- : _func_scharr(), _kernel_tracker(), _scharr_gx(), _scharr_gy(), _new_points(nullptr), _new_points_estimates(nullptr), _old_points(nullptr), _new_points_internal(), _old_points_internal(),
+NEOpticalFlow::NEOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _func_scharr(),
+ _kernel_tracker(),
+ _scharr_gx(),
+ _scharr_gy(),
+ _new_points(nullptr),
+ _new_points_estimates(nullptr),
+ _old_points(nullptr),
+ _new_points_internal(),
+ _old_points_internal(),
_num_levels(0)
{
}
@@ -65,10 +74,10 @@
const float pyr_scale = old_pyramid->info()->scale();
- _func_scharr = arm_compute::cpp14::make_unique<NEScharr3x3[]>(_num_levels);
- _kernel_tracker = arm_compute::cpp14::make_unique<NELKTrackerKernel[]>(_num_levels);
- _scharr_gx = arm_compute::cpp14::make_unique<Tensor[]>(_num_levels);
- _scharr_gy = arm_compute::cpp14::make_unique<Tensor[]>(_num_levels);
+ _func_scharr = arm_compute::support::cpp14::make_unique<NEScharr3x3[]>(_num_levels);
+ _kernel_tracker = arm_compute::support::cpp14::make_unique<NELKTrackerKernel[]>(_num_levels);
+ _scharr_gx = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_levels);
+ _scharr_gy = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_levels);
_old_points_internal = LKInternalKeypointArray(old_points->num_values());
_new_points_internal = LKInternalKeypointArray(old_points->num_values());
@@ -89,6 +98,10 @@
_scharr_gx[i].allocator()->init(tensor_info);
_scharr_gy[i].allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(_scharr_gx.get() + i);
+ _memory_group.manage(_scharr_gy.get() + i);
+
// Init Scharr kernel
_func_scharr[i].configure(old_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, border_mode, constant_border_value);
@@ -108,6 +121,8 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
+ _memory_group.acquire();
+
for(unsigned int level = _num_levels; level > 0; --level)
{
// Run Scharr kernel
@@ -116,4 +131,6 @@
// Run Lucas-Kanade kernel
NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX);
}
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEPhase.cpp b/src/runtime/NEON/functions/NEPhase.cpp
index 7683f46..436d22f 100644
--- a/src/runtime/NEON/functions/NEPhase.cpp
+++ b/src/runtime/NEON/functions/NEPhase.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEPhase.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
k->configure(input1, input2, nullptr, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 056d33b..2e2ea11 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
- auto k = arm_compute::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 6f0cc4f..4c4e11f 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -23,15 +23,15 @@
*/
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
{
// Configure pooling kernel
- auto k = arm_compute::cpp14::make_unique<NEPoolingLayerKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEPoolingLayerKernel>();
k->configure(input, output, pool_info);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
new file mode 100644
index 0000000..a131c48
--- /dev/null
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEQuantizationLayer::NEQuantizationLayer()
+ : _quantize_kernel(), _min_max_kernel(), _min_max()
+{
+}
+
+void NEQuantizationLayer::configure(const ITensor *input, ITensor *output)
+{
+ // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel
+ _min_max_kernel.configure(input, &_min_max);
+
+ // Configure quantize kernel
+ _quantize_kernel.configure(input, output, &_min_max);
+
+ // Allocate min_max tensor
+ _min_max.allocator()->allocate();
+}
+
+void NEQuantizationLayer::run()
+{
+ // Reset min and max
+ _min_max_kernel.reset();
+
+ // Run min and max kernel
+ NEScheduler::get().schedule(&_min_max_kernel, Window::DimY);
+
+ // Run quantize kernel
+ NEScheduler::get().schedule(&_quantize_kernel, Window::DimY);
+}
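
NEQuantizationLayer is a two-pass function: the min/max kernel is reset and scans the whole input on every run(), and the quantize kernel then maps values into the output using that range. A hedged call-site sketch, following the same tensor setup as the NEL2Normalize example earlier; the U8 output format is an assumption about what the quantize kernel writes, since the header is not part of this hunk:

    Tensor src;
    Tensor dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::U8)); // assumed output type

    NEQuantizationLayer quant;
    quant.configure(&src, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src ...
    quant.run(); // pass 1: reset + min/max, pass 2: quantize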
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
new file mode 100644
index 0000000..1f1400c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEROIPoolingLayer::NEROIPoolingLayer()
+ : _roi_kernel()
+{
+}
+
+void NEROIPoolingLayer::configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+ _roi_kernel.configure(input, rois, output, pool_info);
+}
+
+void NEROIPoolingLayer::run()
+{
+ NEScheduler::get().schedule(&_roi_kernel, Window::DimX);
+}
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
new file mode 100644
index 0000000..45c3e5d
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+namespace
+{
+/** Define dimension to split the window
+ *
+ * @param[in] axis Reduction axis
+ *
+ * @return The dimension to split the window
+ */
+size_t reduction_window_split_dimension(unsigned int axis)
+{
+ switch(axis)
+ {
+ case 0:
+ return Window::DimY;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported reduction axis");
+ }
+}
+BorderMode reduction_operation_border_mode(ReductionOperation op)
+{
+ switch(op)
+ {
+ case ReductionOperation::SUM_SQUARE:
+ return BorderMode::CONSTANT;
+ default:
+ return BorderMode::CONSTANT;
+ }
+}
+} // namespace
+
+NEReductionOperation::NEReductionOperation()
+ : _reduction_kernel(), _fill_border_kernel(), _window_split(0)
+{
+}
+
+void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+
+ // Configure reduction kernel
+ _reduction_kernel.configure(input, output, axis, op);
+ _window_split = reduction_window_split_dimension(axis);
+
+ // Configure fill border kernel
+ BorderSize fill_border_size = (axis == 0) ? _reduction_kernel.border_size() : BorderSize();
+ BorderMode fill_border_mode = reduction_operation_border_mode(op);
+ _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(0));
+}
+
+void NEReductionOperation::run()
+{
+ NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_reduction_kernel, _window_split);
+}
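
As written, only axis 0 is accepted (any other axis hits the ARM_COMPUTE_ERROR in reduction_window_split_dimension), the reduction window is split along DimY, and the border is filled with constant zeros, which is neutral for SUM_SQUARE. A hedged sketch of a stand-alone use, again following the NEL2Normalize example's setup; that the reduced dimension collapses to size 1 in the output shape is an assumption inferred from how NEL2Normalize consumes this function:

    Tensor src;
    Tensor sumsq;
    src.allocator()->init(TensorInfo(TensorShape(256U, 16U), 1, DataType::F32));
    sumsq.allocator()->init(TensorInfo(TensorShape(1U, 16U), 1, DataType::F32)); // axis 0 collapsed (assumed)

    NEReductionOperation reduction;
    reduction.configure(&src, &sumsq, 0 /* axis */, ReductionOperation::SUM_SQUARE);

    src.allocator()->allocate();
    sumsq.allocator()->allocate();
    // ... fill src ...
    reduction.run(); // border fill, then the row-split reduction kernel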
diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp
index 9f06fb6..882e93b 100644
--- a/src/runtime/NEON/functions/NERemap.cpp
+++ b/src/runtime/NEON/functions/NERemap.cpp
@@ -24,13 +24,13 @@
#include "arm_compute/runtime/NEON/functions/NERemap.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NERemapKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -44,7 +44,7 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
- auto k = arm_compute::cpp14::make_unique<NERemapKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NERemapKernel>();
k->configure(input, map_x, map_y, output, policy);
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
new file mode 100644
index 0000000..fef4e0c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEReshapeLayer::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEReshapeLayerKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index b70f626..bbd3fac 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -27,11 +27,12 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <cmath>
#include <cstddef>
@@ -85,12 +86,16 @@
}
} // namespace
-NEScale::NEScale()
- : _offsets(), _dx(), _dy()
+NEScale::NEScale() // NOLINT
+ : _offsets(),
+ _dx(),
+ _dy(),
+ _scale_kernel(),
+ _border_handler()
{
}
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
{
ARM_COMPUTE_ERROR_ON(nullptr == input);
ARM_COMPUTE_ERROR_ON(nullptr == output);
@@ -116,8 +121,6 @@
policy = InterpolationPolicy::NEAREST_NEIGHBOR;
}
- auto k = arm_compute::cpp14::make_unique<NEScaleKernel>();
-
// Check if the border mode is UNDEFINED
const bool border_undefined = border_mode == BorderMode::UNDEFINED;
@@ -128,7 +131,7 @@
TensorInfo tensor_info_offsets(shape, Format::S32);
_offsets.allocator()->init(tensor_info_offsets);
- k->configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
+ _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
// Allocate once the configure methods have been called
_offsets.allocator()->allocate();
@@ -146,7 +149,7 @@
_dx.allocator()->init(tensor_info_dxdy);
_dy.allocator()->init(tensor_info_dxdy);
- k->configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined);
+ _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined);
// Allocate once the configure methods have been called
_offsets.allocator()->allocate();
@@ -159,13 +162,18 @@
}
case InterpolationPolicy::AREA:
{
- k->configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined);
+ _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined);
break;
}
default:
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
- _kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+void NEScale::run()
+{
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+ NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
}
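
NEScale is no longer a simple one-kernel function: it owns its scale kernel and border handler, schedules the border fill itself before the scale kernel, and the constant border value is now a PixelValue rather than a raw uint8_t. A hedged call-site sketch of the new signature; formats and sizes are illustrative:

    Tensor src;
    Tensor dst;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));
    dst.allocator()->init(TensorInfo(TensorShape(320U, 240U), Format::U8));

    NEScale scale;
    scale.configure(&src, &dst, InterpolationPolicy::BILINEAR, BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src ...
    scale.run(); // border fill (DimZ), then the scale kernel (DimY)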
diff --git a/src/runtime/NEON/functions/NEScharr3x3.cpp b/src/runtime/NEON/functions/NEScharr3x3.cpp
index 04b3f14..ba9985e 100644
--- a/src/runtime/NEON/functions/NEScharr3x3.cpp
+++ b/src/runtime/NEON/functions/NEScharr3x3.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void NEScharr3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NEScharr3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEScharr3x3Kernel>();
k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NESobel3x3.cpp b/src/runtime/NEON/functions/NESobel3x3.cpp
index 3b46fd7..753b1f6 100644
--- a/src/runtime/NEON/functions/NESobel3x3.cpp
+++ b/src/runtime/NEON/functions/NESobel3x3.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void NESobel3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NESobel3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NESobel3x3Kernel>();
k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
index 8967a22..d8f4eda 100644
--- a/src/runtime/NEON/functions/NESobel5x5.cpp
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -32,8 +32,8 @@
using namespace arm_compute;
-NESobel5x5::NESobel5x5()
- : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
+NESobel5x5::NESobel5x5(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
{
}
@@ -50,6 +50,8 @@
{
_tmp_x.allocator()->init(tensor_info);
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -58,6 +60,7 @@
else if(run_sobel_x)
{
_tmp_x.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
_sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -65,6 +68,7 @@
else if(run_sobel_y)
{
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_y.allocator()->allocate();
@@ -75,7 +79,12 @@
void NESobel5x5::run()
{
- _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+
+ _memory_group.acquire();
+
NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
index f628da9..5b6f60b 100644
--- a/src/runtime/NEON/functions/NESobel7x7.cpp
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -32,8 +32,8 @@
using namespace arm_compute;
-NESobel7x7::NESobel7x7()
- : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
+NESobel7x7::NESobel7x7(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
{
}
@@ -50,6 +50,8 @@
{
_tmp_x.allocator()->init(tensor_info);
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -58,6 +60,7 @@
else if(run_sobel_x)
{
_tmp_x.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
_sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -65,6 +68,7 @@
else if(run_sobel_y)
{
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_y.allocator()->allocate();
@@ -75,7 +79,12 @@
void NESobel7x7::run()
{
- _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+
+ _memory_group.acquire();
+
NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 0651eab..cc5d4e9 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -31,15 +31,14 @@
using namespace arm_compute;
-NESoftmaxLayer::NESoftmaxLayer()
- : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp()
+NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp()
{
}
void NESoftmaxLayer::configure(ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
// Create intermediate tensors shapes
TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
@@ -51,11 +50,16 @@
_max.allocator()->init(tensor_info_max_sum);
_sum.allocator()->init(tensor_info_max_sum);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+ _memory_group.manage(&_max);
+ _memory_group.manage(&_sum);
+
// Configure Kernels
_max_kernel.configure(input, &_max);
_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
_norm_kernel.configure(&_tmp, &_sum, output);
- _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-FLT_MAX));
+ _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE);
// Allocate intermediate tensors
_tmp.allocator()->allocate();
@@ -65,8 +69,12 @@
void NESoftmaxLayer::run()
{
+ _memory_group.acquire();
+
NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
NEScheduler::get().schedule(&_max_kernel, Window::DimY);
NEScheduler::get().schedule(&_shift_exp_sum_kernel, Window::DimY);
NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+
+ _memory_group.release();
}
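
All of the functions above now thread an optional std::shared_ptr<IMemoryManager> into their MemoryGroup. The reason it is a constructor argument is that several functions can share one manager, so intermediates whose lifetimes never overlap (each group only holds memory between acquire() and release() inside run()) can be backed by the same pool. A sketch of that wiring; how the concrete IMemoryManager is built is outside this patch, so create_memory_manager() is an assumed helper:

    // Assumed helper that returns some concrete IMemoryManager implementation (not shown in this patch).
    std::shared_ptr<IMemoryManager> mm = create_memory_manager();

    // Both functions register their intermediates with the same manager, so the softmax
    // temporaries and the Sobel temporaries can reuse the same backing memory.
    NESoftmaxLayer softmax(mm);
    NESobel5x5    sobel(mm);

    // ... configure and allocate the I/O tensors, then:
    softmax.run(); // acquire -> kernels -> release
    sobel.run();   // acquire -> kernels -> release, potentially on the same pool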
diff --git a/src/runtime/NEON/functions/NETableLookup.cpp b/src/runtime/NEON/functions/NETableLookup.cpp
index ebb8a0a..cae117a 100644
--- a/src/runtime/NEON/functions/NETableLookup.cpp
+++ b/src/runtime/NEON/functions/NETableLookup.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NETableLookup.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NETableLookup::configure(const ITensor *input, const ILut *lut, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NETableLookupKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NETableLookupKernel>();
k->configure(input, lut, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEThreshold.cpp b/src/runtime/NEON/functions/NEThreshold.cpp
index 93dc124..37883e5 100644
--- a/src/runtime/NEON/functions/NEThreshold.cpp
+++ b/src/runtime/NEON/functions/NEThreshold.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEThreshold.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEThreshold::configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
{
- auto k = arm_compute::cpp14::make_unique<NEThresholdKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEThresholdKernel>();
k->configure(input, output, threshold, false_value, true_value, type, upper);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
index 53ac9c5..eb81e02 100644
--- a/src/runtime/NEON/functions/NETranspose.cpp
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NETranspose::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NETransposeKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEWarpAffine.cpp b/src/runtime/NEON/functions/NEWarpAffine.cpp
index 24fb16f..889d827 100644
--- a/src/runtime/NEON/functions/NEWarpAffine.cpp
+++ b/src/runtime/NEON/functions/NEWarpAffine.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/runtime/NEON/functions/NEWarpAffine.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -41,14 +42,14 @@
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
- auto k = arm_compute::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
k->configure(input, output, matrix, border_mode, constant_border_value);
_kernel = std::move(k);
break;
}
case InterpolationPolicy::BILINEAR:
{
- auto k = arm_compute::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::BILINEAR>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::BILINEAR>>();
k->configure(input, output, matrix, border_mode, constant_border_value);
_kernel = std::move(k);
break;
diff --git a/src/runtime/NEON/functions/NEWarpPerspective.cpp b/src/runtime/NEON/functions/NEWarpPerspective.cpp
index 84b2df5..ed5d6a0 100644
--- a/src/runtime/NEON/functions/NEWarpPerspective.cpp
+++ b/src/runtime/NEON/functions/NEWarpPerspective.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -41,14 +42,14 @@
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
- auto k = arm_compute::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
k->configure(input, output, matrix, border_mode, constant_border_value);
_kernel = std::move(k);
break;
}
case InterpolationPolicy::BILINEAR:
{
- auto k = arm_compute::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::BILINEAR>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::BILINEAR>>();
k->configure(input, output, matrix, border_mode, constant_border_value);
_kernel = std::move(k);
break;
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index 0cced73..be81641 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -38,7 +38,7 @@
return scheduler;
}
-OMPScheduler::OMPScheduler()
+OMPScheduler::OMPScheduler() // NOLINT
: _num_threads(omp_get_max_threads())
{
}
diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp
new file mode 100644
index 0000000..42cc943
--- /dev/null
+++ b/src/runtime/PoolManager.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/PoolManager.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IMemoryPool.h"
+#include "support/ToolchainSupport.h"
+
+#include <list>
+
+using namespace arm_compute;
+
+PoolManager::PoolManager()
+ : _free_pools(), _occupied_pools(), _sem(), _mtx()
+{
+}
+
+IMemoryPool *PoolManager::lock_pool()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty() && _occupied_pools.empty(), "Haven't setup any pools!");
+
+ _sem->wait();
+ std::lock_guard<arm_compute::Mutex> lock(_mtx);
+ ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty(), "Empty pool must exist as semaphore has been signalled");
+ _occupied_pools.splice(std::begin(_occupied_pools), _free_pools, std::begin(_free_pools));
+ return _occupied_pools.front().get();
+}
+
+void PoolManager::unlock_pool(IMemoryPool *pool)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty() && _occupied_pools.empty(), "Haven't setup any pools!");
+
+ std::lock_guard<arm_compute::Mutex> lock(_mtx);
+ auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), [pool](const std::unique_ptr<IMemoryPool> &pool_it)
+ {
+ return pool_it.get() == pool;
+ });
+ ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_occupied_pools), "Pool to be unlocked couldn't be found!");
+ _free_pools.splice(std::begin(_free_pools), _occupied_pools, it);
+ _sem->signal();
+}
+
+void PoolManager::register_pool(std::unique_ptr<IMemoryPool> pool)
+{
+ std::lock_guard<arm_compute::Mutex> lock(_mtx);
+ ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to register a new one!");
+
+ // Set pool
+ _free_pools.push_front(std::move(pool));
+
+ // Update semaphore
+ _sem = arm_compute::support::cpp14::make_unique<arm_compute::Semaphore>(_free_pools.size());
+}
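
The new PoolManager file implements a blocking pool hand-out: a counting semaphore gates lock_pool() so callers sleep until a pool is free, a mutex protects the two std::lists, and std::list::splice moves a pool between the free and occupied lists without copying or reallocating. The sketch below reproduces that pattern in self-contained form; "Pool" is a stand-in for IMemoryPool and the semaphore is built from std::condition_variable, since arm_compute::Semaphore is a library type not shown here.

// Self-contained sketch of the PoolManager pattern above.
#include <algorithm>
#include <condition_variable>
#include <list>
#include <memory>
#include <mutex>

struct Pool { /* stand-in for IMemoryPool */ };

class SimplePoolManager
{
public:
    void register_pool(std::unique_ptr<Pool> pool)
    {
        std::lock_guard<std::mutex> lock(_mtx);
        _free.push_front(std::move(pool));
        ++_count; // one more pool available to hand out
    }

    Pool *lock_pool()
    {
        std::unique_lock<std::mutex> lock(_mtx);
        _cv.wait(lock, [this] { return _count > 0; }); // block until a pool is free
        --_count;
        _occupied.splice(std::begin(_occupied), _free, std::begin(_free));
        return _occupied.front().get();
    }

    void unlock_pool(Pool *pool)
    {
        std::lock_guard<std::mutex> lock(_mtx);
        auto it = std::find_if(std::begin(_occupied), std::end(_occupied),
                               [pool](const std::unique_ptr<Pool> &p) { return p.get() == pool; });
        if(it != std::end(_occupied))
        {
            _free.splice(std::begin(_free), _occupied, it);
            ++_count;
            _cv.notify_one(); // wake one waiter in lock_pool()
        }
    }

private:
    std::list<std::unique_ptr<Pool>> _free{};
    std::list<std::unique_ptr<Pool>> _occupied{};
    std::mutex                       _mtx{};
    std::condition_variable          _cv{};
    int                              _count{0};
};

In the real code, register_pool() asserts that no pool is currently occupied, so registration is expected to happen up front, before any worker starts calling lock_pool()/unlock_pool().
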
diff --git a/src/runtime/Pyramid.cpp b/src/runtime/Pyramid.cpp
index f1b6c93..ebd6570 100644
--- a/src/runtime/Pyramid.cpp
+++ b/src/runtime/Pyramid.cpp
@@ -24,10 +24,10 @@
#include "arm_compute/runtime/Pyramid.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PyramidInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
+#include "support/ToolchainSupport.h"
#include <cmath>
@@ -46,7 +46,7 @@
void Pyramid::internal_init(const PyramidInfo &info, bool auto_padding)
{
_info = info;
- _pyramid = arm_compute::cpp14::make_unique<Tensor[]>(_info.num_levels());
+ _pyramid = arm_compute::support::cpp14::make_unique<Tensor[]>(_info.num_levels());
size_t w = _info.width();
size_t h = _info.height();
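
The Pyramid hunk allocates one Tensor per level via the array overload of make_unique and then sizes each level starting from the pyramid's base width and height. As an illustration only, the sketch below shows the usual per-level sizing for a scaled pyramid (each level shrinks the previous one by a scale factor, clamped to at least 1 pixel); the 0.5 factor and the rounding here are assumptions for the example, not the library's exact arithmetic.

// Illustrative sketch of sizing pyramid levels from a base resolution and a scale factor.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>

int main()
{
    std::size_t       w          = 640;
    std::size_t       h          = 480;
    const float       scale      = 0.5f; // e.g. a half-scale pyramid
    const std::size_t num_levels = 4;

    for(std::size_t level = 0; level < num_levels; ++level)
    {
        std::printf("level %zu: %zux%zu\n", level, w, h);
        w = std::max<std::size_t>(1, static_cast<std::size_t>(std::ceil(w * scale)));
        h = std::max<std::size_t>(1, static_cast<std::size_t>(std::ceil(h * scale)));
    }
    return 0;
}
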
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
index a131928..505c4a3 100644
--- a/src/runtime/Scheduler.cpp
+++ b/src/runtime/Scheduler.cpp
@@ -26,13 +26,13 @@
#include "arm_compute/core/Error.h"
#if ARM_COMPUTE_CPP_SCHEDULER
#include "arm_compute/runtime/CPP/CPPScheduler.h"
-#endif
+#endif /* ARM_COMPUTE_CPP_SCHEDULER */
#include "arm_compute/runtime/SingleThreadScheduler.h"
#if ARM_COMPUTE_OPENMP_SCHEDULER
#include "arm_compute/runtime/OMP/OMPScheduler.h"
-#endif
+#endif /* ARM_COMPUTE_OPENMP_SCHEDULER */
using namespace arm_compute;
@@ -42,9 +42,9 @@
Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
#elif ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER
Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
-#else
+#else /* ARM_COMPUTE_*_SCHEDULER */
Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST;
-#endif
+#endif /* ARM_COMPUTE_*_SCHEDULER */
void Scheduler::set(Type t)
{
@@ -64,17 +64,17 @@
{
#if ARM_COMPUTE_CPP_SCHEDULER
return true;
-#else
+#else /* ARM_COMPUTE_CPP_SCHEDULER */
return false;
-#endif
+#endif /* ARM_COMPUTE_CPP_SCHEDULER */
}
case Type::OMP:
{
#if ARM_COMPUTE_OPENMP_SCHEDULER
return true;
-#else
+#else /* ARM_COMPUTE_OPENMP_SCHEDULER */
return false;
-#endif
+#endif /* ARM_COMPUTE_OPENMP_SCHEDULER */
}
case Type::CUSTOM:
{
@@ -105,18 +105,18 @@
{
#if ARM_COMPUTE_CPP_SCHEDULER
return CPPScheduler::get();
-#else
+#else /* ARM_COMPUTE_CPP_SCHEDULER */
ARM_COMPUTE_ERROR("Recompile with cppthreads=1 to use C++11 scheduler.");
-#endif
+#endif /* ARM_COMPUTE_CPP_SCHEDULER */
break;
}
case Type::OMP:
{
#if ARM_COMPUTE_OPENMP_SCHEDULER
return OMPScheduler::get();
-#else
+#else /* ARM_COMPUTE_OPENMP_SCHEDULER */
ARM_COMPUTE_ERROR("Recompile with openmp=1 to use openmp scheduler.");
-#endif
+#endif /* ARM_COMPUTE_OPENMP_SCHEDULER */
break;
}
case Type::CUSTOM:
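
The Scheduler.cpp hunks only annotate the #else/#endif directives with the macro they close, but they expose the underlying pattern: build-time flags (cppthreads=1 / openmp=1) decide which backends are compiled in, the default scheduler type is chosen by an #if/#elif chain, and the runtime queries simply report what the build enabled. The sketch below shows that pattern with illustrative stand-in macros and types, not the library's own.

// Sketch of compile-time backend selection with commented #endif guards.
#include <stdexcept>

#define SKETCH_HAS_CPP_SCHEDULER 1
#define SKETCH_HAS_OMP_SCHEDULER 0

enum class SchedulerType { ST, CPP, OMP };

#if SKETCH_HAS_CPP_SCHEDULER
static SchedulerType g_default_type = SchedulerType::CPP;
#elif SKETCH_HAS_OMP_SCHEDULER
static SchedulerType g_default_type = SchedulerType::OMP;
#else  /* no multi-threaded scheduler compiled in */
static SchedulerType g_default_type = SchedulerType::ST;
#endif /* SKETCH_HAS_*_SCHEDULER */

bool is_available(SchedulerType t)
{
    switch(t)
    {
        case SchedulerType::ST:
            return true;
        case SchedulerType::CPP:
#if SKETCH_HAS_CPP_SCHEDULER
            return true;
#else  /* SKETCH_HAS_CPP_SCHEDULER */
            return false;
#endif /* SKETCH_HAS_CPP_SCHEDULER */
        case SchedulerType::OMP:
#if SKETCH_HAS_OMP_SCHEDULER
            return true;
#else  /* SKETCH_HAS_OMP_SCHEDULER */
            return false;
#endif /* SKETCH_HAS_OMP_SCHEDULER */
        default:
            throw std::runtime_error("Unknown scheduler type");
    }
}
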
diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp
index 435068c..a76c37e 100644
--- a/src/runtime/Tensor.cpp
+++ b/src/runtime/Tensor.cpp
@@ -26,7 +26,7 @@
using namespace arm_compute;
Tensor::Tensor()
- : _allocator()
+ : _allocator(this)
{
}
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 5c719c7..272b9f5 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/MemoryGroup.h"
#include <cstddef>
@@ -63,11 +64,50 @@
}
} // namespace
-TensorAllocator::TensorAllocator()
- : _buffer(nullptr)
+TensorAllocator::TensorAllocator(Tensor *owner)
+ : _associated_memory_group(nullptr), _buffer(nullptr), _owner(owner)
{
}
+TensorAllocator::~TensorAllocator()
+{
+ if((_associated_memory_group == nullptr) && (_buffer != nullptr))
+ {
+ delete[] _buffer;
+ _buffer = nullptr;
+ info().set_is_resizable(true);
+ }
+}
+
+TensorAllocator::TensorAllocator(TensorAllocator &&o) noexcept
+ : ITensorAllocator(std::move(o)),
+ _associated_memory_group(o._associated_memory_group),
+ _buffer(o._buffer),
+ _owner(o._owner)
+{
+ o._associated_memory_group = nullptr;
+ o._buffer = nullptr;
+ o._owner = nullptr;
+}
+
+TensorAllocator &TensorAllocator::operator=(TensorAllocator &&o) noexcept
+{
+ if(&o != this)
+ {
+ _associated_memory_group = o._associated_memory_group;
+ o._associated_memory_group = nullptr;
+
+ _buffer = o._buffer;
+ o._buffer = nullptr;
+
+ _owner = o._owner;
+ o._owner = nullptr;
+
+ ITensorAllocator::operator=(std::move(o));
+ }
+ return *this;
+}
+
void TensorAllocator::init(const TensorAllocator &allocator, const Coordinates &coords, TensorInfo sub_info)
{
// Get parent info
@@ -90,28 +130,44 @@
uint8_t *TensorAllocator::data() const
{
- return (_buffer != nullptr) ? _buffer.get()->data() : nullptr;
+ return _buffer;
}
void TensorAllocator::allocate()
{
ARM_COMPUTE_ERROR_ON(_buffer != nullptr);
-
- _buffer = std::make_shared<std::vector<uint8_t>>(info().total_size());
+ if(_associated_memory_group == nullptr)
+ {
+ _buffer = new uint8_t[info().total_size()]();
+ }
+ else
+ {
+ _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_buffer), info().total_size());
+ }
info().set_is_resizable(false);
}
void TensorAllocator::free()
{
- ARM_COMPUTE_ERROR_ON(_buffer == nullptr);
+ if((_associated_memory_group == nullptr) && (_buffer != nullptr))
+ {
+ delete[] _buffer;
+ _buffer = nullptr;
+ info().set_is_resizable(true);
+ }
+}
- _buffer.reset();
- info().set_is_resizable(true);
+void TensorAllocator::set_associated_memory_group(MemoryGroup *associated_memory_group)
+{
+ ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
+ ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
+ ARM_COMPUTE_ERROR_ON(_buffer != nullptr);
+ _associated_memory_group = associated_memory_group;
}
uint8_t *TensorAllocator::lock()
{
- return (_buffer != nullptr) ? _buffer.get()->data() : nullptr;
+ return _buffer;
}
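
The rewritten TensorAllocator replaces the shared_ptr<std::vector<uint8_t>> buffer with a raw uint8_t* that is either owned directly (new[]/delete[]) or provided later by an associated MemoryGroup via finalize_memory(), and it adds move operations that transfer the pointer and null out the source; the Tensor constructor change earlier in the diff supplies the back-pointer (`this`) that the memory group needs. The sketch below condenses that ownership scheme into a self-contained class with stand-in names; the memory-group side is reduced to a flag, and unlike the diff's move assignment, this sketch frees any existing buffer before overwriting it, which is a deliberate choice for the example rather than the library's behaviour.

// Self-contained sketch of the raw-buffer ownership and move semantics above.
#include <cstddef>
#include <cstdint>
#include <utility>

class SketchAllocator
{
public:
    explicit SketchAllocator(void *owner = nullptr)
        : _externally_managed(false), _buffer(nullptr), _owner(owner)
    {
    }

    ~SketchAllocator()
    {
        free();
    }

    // Move steals the buffer and flags; the moved-from object no longer frees anything.
    SketchAllocator(SketchAllocator &&o) noexcept
        : _externally_managed(o._externally_managed), _buffer(o._buffer), _owner(o._owner)
    {
        o._externally_managed = false;
        o._buffer             = nullptr;
        o._owner              = nullptr;
    }

    SketchAllocator &operator=(SketchAllocator &&o) noexcept
    {
        if(&o != this)
        {
            free(); // release any buffer we currently own before taking o's
            _externally_managed = std::exchange(o._externally_managed, false);
            _buffer             = std::exchange(o._buffer, nullptr);
            _owner              = std::exchange(o._owner, nullptr);
        }
        return *this;
    }

    void allocate(std::size_t bytes)
    {
        if(!_externally_managed)
        {
            _buffer = new uint8_t[bytes](); // value-initialised, owned by this allocator
        }
        // else: an external memory manager would set _buffer later,
        // mirroring finalize_memory() in the diff above.
    }

    void free()
    {
        if(!_externally_managed && _buffer != nullptr)
        {
            delete[] _buffer; // only delete what we allocated ourselves
            _buffer = nullptr;
        }
    }

    uint8_t *data() const
    {
        return _buffer;
    }

private:
    bool     _externally_managed;
    uint8_t *_buffer;
    void    *_owner; // back-pointer to the owning tensor (cf. Tensor passing `this`)
};
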
void TensorAllocator::unlock()